DataCamp offers interactive courses related to R programming. While some of the material is review, it is helpful to see other perspectives on it. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
This document is currently split between _v003 and _v003_a due to the need to keep the number of DLLs that it opens below the hard-coded maximum. This introductory section needs to be rewritten, and the contents consolidated, at a future date.
The original DataCamp_Insights_v001 and DataCamp_Insights_v002 documents have been split to produce this document:
Chapter 1 - Dates and Times in R
Introduction to dates - including the built-in methods for R:
Why use dates?
What about times?
Why lubridate?
Example code includes:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# The date R 3.0.0 was released
x <- "2013-04-03"
# Examine structure of x
str(x)
## chr "2013-04-03"
# Use as.Date() to interpret x as a date
x_date <- as.Date(x)
# Examine structure of x_date
str(x_date)
## Date[1:1], format: "2013-04-03"
# Store April 10 2014 as a Date
april_10_2014 <- as.Date("2014-04-10")
# Load the readr package
library(readr)
# Use read_csv() to import rversions.csv
releases <- read_csv("./RInputFiles/rversions.csv")
## Parsed with column specification:
## cols(
## major = col_integer(),
## minor = col_integer(),
## patch = col_integer(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
# Examine the structure of the date column
str(releases$date)
## Date[1:105], format: "1997-12-04" "1997-12-21" "1998-01-10" "1998-03-14" ...
# Load the anytime package
library(anytime)
# Various ways of writing Sep 10 2009
sep_10_2009 <- c("September 10 2009", "2009-09-10", "10 Sep 2009", "09-10-2009")
# Use anytime() to parse sep_10_2009
anytime(sep_10_2009)
## [1] "2009-09-10 CDT" "2009-09-10 CDT" "2009-09-10 CDT" "2009-09-10 CDT"
# Set the x axis to the date column
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major)))
# Limit the axis to between 2010-01-01 and 2014-01-01
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major))) +
xlim(as.Date("2010-01-01"), as.Date("2014-01-01"))
## Warning: Removed 87 rows containing missing values (geom_path).
# Specify breaks every ten years and labels with "%Y"
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major))) +
scale_x_date(date_breaks = "10 years", date_labels = "%Y")
# Find the largest date
last_release_date <- max(releases$date)
# Filter row for last release
last_release <- filter(releases, date == last_release_date)
# Print last_release
last_release
## # A tibble: 1 x 7
## major minor patch date datetime time type
## <int> <int> <int> <date> <dttm> <time> <chr>
## 1 3 4 1 2017-06-30 2017-06-30 07:04:11 07:04 patch
# How long since last release?
Sys.Date() - last_release_date
## Time difference of 244 days
# Use as.POSIXct to enter the datetime
as.POSIXct("2010-10-01 12:12:00")
## [1] "2010-10-01 12:12:00 CDT"
# Use as.POSIXct again but set the timezone to `"America/Los_Angeles"`
as.POSIXct("2010-10-01 12:12:00", tz = "America/Los_Angeles")
## [1] "2010-10-01 12:12:00 PDT"
# Use read_csv to import rversions.csv
releases <- read_csv("./RInputFiles/rversions.csv")
## Parsed with column specification:
## cols(
## major = col_integer(),
## minor = col_integer(),
## patch = col_integer(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
# Examine structure of datetime column
str(releases$datetime)
## POSIXct[1:105], format: "1997-12-04 08:47:58" "1997-12-21 13:09:22" ...
# Import "cran-logs_2015-04-17.csv" with read_csv()
logs <- read_csv("./RInputFiles/cran-logs_2015-04-17.csv")
## Parsed with column specification:
## cols(
## datetime = col_datetime(format = ""),
## r_version = col_character(),
## country = col_character()
## )
# Print logs
logs
## # A tibble: 100,000 x 3
## datetime r_version country
## <dttm> <chr> <chr>
## 1 2015-04-16 22:40:19 3.1.3 CO
## 2 2015-04-16 09:11:04 3.1.3 GB
## 3 2015-04-16 17:12:37 3.1.3 DE
## 4 2015-04-18 12:34:43 3.2.0 GB
## 5 2015-04-16 04:49:18 3.1.3 PE
## 6 2015-04-16 06:40:44 3.1.3 TW
## 7 2015-04-16 00:21:36 3.1.3 US
## 8 2015-04-16 10:27:23 3.1.3 US
## 9 2015-04-16 01:59:43 3.1.3 SG
## 10 2015-04-18 15:41:32 3.2.0 CA
## # ... with 99,990 more rows
# Store the release time as a POSIXct object
release_time <- as.POSIXct("2015-04-16 07:13:33", tz = "UTC")
# When is the first download of 3.2.0?
logs %>%
filter(r_version == "3.2.0")
## # A tibble: 35,928 x 3
## datetime r_version country
## <dttm> <chr> <chr>
## 1 2015-04-18 12:34:43 3.2.0 GB
## 2 2015-04-18 15:41:32 3.2.0 CA
## 3 2015-04-18 14:58:41 3.2.0 IE
## 4 2015-04-18 16:44:45 3.2.0 US
## 5 2015-04-18 04:34:35 3.2.0 US
## 6 2015-04-18 22:29:45 3.2.0 CH
## 7 2015-04-17 16:21:06 3.2.0 US
## 8 2015-04-18 20:34:57 3.2.0 AT
## 9 2015-04-17 18:23:19 3.2.0 US
## 10 2015-04-18 03:00:31 3.2.0 US
## # ... with 35,918 more rows
# Examine histograms of downloads by version
ggplot(logs, aes(x = datetime)) +
geom_histogram() +
geom_vline(aes(xintercept = as.numeric(release_time)))+
facet_wrap(~ r_version, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Chapter 2 - Parsing and Manipulating Dates with lubridate
Parsing dates with lubridate:
Weather in Auckland (data from Weather Underground, METAR from Auckland airport):
Extracting parts of a datetime:
Rounding datetimes:
Example code includes:
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(readr)
library(dplyr)
library(ggplot2)
library(ggridges)
library(stringr)
# Parse x
x <- "2010 September 20th" # 2010-09-20
ymd(x)
## [1] "2010-09-20"
# Parse y
y <- "02.01.2010" # 2010-01-02
dmy(y)
## [1] "2010-01-02"
# Parse z
z <- "Sep, 12th 2010 14:00" # 2010-09-12T14:00
mdy_hm(z)
## [1] "2010-09-12 14:00:00 UTC"
# Specify an order string to parse x
x <- "Monday June 1st 2010 at 4pm"
parse_date_time(x, orders = "AmdyIp")
## [1] "2010-06-01 16:00:00 UTC"
# Specify order to include both "mdy" and "dmy"
two_orders <- c("October 7, 2001", "October 13, 2002", "April 13, 2003",
"17 April 2005", "23 April 2017")
parse_date_time(two_orders, orders = c("mdy", "dmy"))
## [1] "2001-10-07 UTC" "2002-10-13 UTC" "2003-04-13 UTC" "2005-04-17 UTC"
## [5] "2017-04-23 UTC"
# Specify order to include "dOmY", "OmY" and "Y"
short_dates <- c("11 December 1282", "May 1372", "1253")
parse_date_time(short_dates, orders = c("dOmY", "OmY", "Y"))
## [1] "1282-12-11 UTC" "1372-05-01 UTC" "1253-01-01 UTC"
# Import CSV with read_csv()
akl_daily_raw <- read_csv("./RInputFiles/akl_weather_daily.csv")
## Parsed with column specification:
## cols(
## date = col_character(),
## max_temp = col_integer(),
## min_temp = col_integer(),
## mean_temp = col_integer(),
## mean_rh = col_integer(),
## events = col_character(),
## cloud_cover = col_integer()
## )
# Print akl_daily_raw
akl_daily_raw
## # A tibble: 3,661 x 7
## date max_temp min_temp mean_temp mean_rh events cloud_cover
## <chr> <int> <int> <int> <int> <chr> <int>
## 1 2007-9-1 60 51 56 75 <NA> 4
## 2 2007-9-2 60 53 56 82 Rain 4
## 3 2007-9-3 57 51 54 78 <NA> 6
## 4 2007-9-4 64 50 57 80 Rain 6
## 5 2007-9-5 53 48 50 90 Rain 7
## 6 2007-9-6 57 42 50 69 <NA> 1
## 7 2007-9-7 59 41 50 77 <NA> 4
## 8 2007-9-8 59 46 52 80 <NA> 5
## 9 2007-9-9 55 50 52 88 Rain 7
## 10 2007-9-10 59 50 54 82 Rain 4
## # ... with 3,651 more rows
# Parse date
akl_daily <- akl_daily_raw %>%
mutate(date = ymd(date))
# Print akl_daily
akl_daily
## # A tibble: 3,661 x 7
## date max_temp min_temp mean_temp mean_rh events cloud_cover
## <date> <int> <int> <int> <int> <chr> <int>
## 1 2007-09-01 60 51 56 75 <NA> 4
## 2 2007-09-02 60 53 56 82 Rain 4
## 3 2007-09-03 57 51 54 78 <NA> 6
## 4 2007-09-04 64 50 57 80 Rain 6
## 5 2007-09-05 53 48 50 90 Rain 7
## 6 2007-09-06 57 42 50 69 <NA> 1
## 7 2007-09-07 59 41 50 77 <NA> 4
## 8 2007-09-08 59 46 52 80 <NA> 5
## 9 2007-09-09 55 50 52 88 Rain 7
## 10 2007-09-10 59 50 54 82 Rain 4
## # ... with 3,651 more rows
# Plot to check work
ggplot(akl_daily, aes(x = date, y = max_temp)) +
geom_line()
## Warning: Removed 1 rows containing missing values (geom_path).
# Import "akl_weather_hourly_2016.csv"
akl_hourly_raw <- read_csv("./RInputFiles/akl_weather_hourly_2016.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## month = col_integer(),
## mday = col_integer(),
## time = col_time(format = ""),
## temperature = col_double(),
## weather = col_character(),
## conditions = col_character(),
## events = col_character(),
## humidity = col_integer(),
## date_utc = col_datetime(format = "")
## )
# Print akl_hourly_raw
akl_hourly_raw
## # A tibble: 17,454 x 10
## year month mday time temperature weather conditions events humidity
## <int> <int> <int> <time> <dbl> <chr> <chr> <chr> <int>
## 1 2016 1 1 00:00 68.0 Clear Clear <NA> 68
## 2 2016 1 1 00:30 68.0 Clear Clear <NA> 68
## 3 2016 1 1 01:00 68.0 Clear Clear <NA> 73
## 4 2016 1 1 01:30 68.0 Clear Clear <NA> 68
## 5 2016 1 1 02:00 68.0 Clear Clear <NA> 68
## 6 2016 1 1 02:30 68.0 Clear Clear <NA> 68
## 7 2016 1 1 03:00 68.0 Clear Clear <NA> 68
## 8 2016 1 1 03:30 68.0 Cloudy Partly Cl~ <NA> 68
## 9 2016 1 1 04:00 68.0 Cloudy Scattered~ <NA> 68
## 10 2016 1 1 04:30 66.2 Cloudy Partly Cl~ <NA> 73
## # ... with 17,444 more rows, and 1 more variable: date_utc <dttm>
# Use make_date() to combine year, month and mday
akl_hourly <- akl_hourly_raw %>%
mutate(date = make_date(year = year, month = month, day = mday))
# Parse datetime_string
akl_hourly <- akl_hourly %>%
mutate(
datetime_string = paste(date, time, sep = "T"),
datetime = ymd_hms(datetime_string)
)
# Print date, time and datetime columns of akl_hourly
akl_hourly %>% select(date, time, datetime)
## # A tibble: 17,454 x 3
## date time datetime
## <date> <time> <dttm>
## 1 2016-01-01 00:00 2016-01-01 00:00:00
## 2 2016-01-01 00:30 2016-01-01 00:30:00
## 3 2016-01-01 01:00 2016-01-01 01:00:00
## 4 2016-01-01 01:30 2016-01-01 01:30:00
## 5 2016-01-01 02:00 2016-01-01 02:00:00
## 6 2016-01-01 02:30 2016-01-01 02:30:00
## 7 2016-01-01 03:00 2016-01-01 03:00:00
## 8 2016-01-01 03:30 2016-01-01 03:30:00
## 9 2016-01-01 04:00 2016-01-01 04:00:00
## 10 2016-01-01 04:30 2016-01-01 04:30:00
## # ... with 17,444 more rows
# Plot to check work
ggplot(akl_hourly, aes(x = datetime, y = temperature)) +
geom_line()
# Examine the head() of release_time
releases <- read_csv("./RInputFiles/rversions.csv")
## Parsed with column specification:
## cols(
## major = col_integer(),
## minor = col_integer(),
## patch = col_integer(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
release_time <- releases %>% pull(datetime)
head(release_time)
## [1] "1997-12-04 08:47:58 UTC" "1997-12-21 13:09:22 UTC"
## [3] "1998-01-10 00:31:55 UTC" "1998-03-14 19:25:55 UTC"
## [5] "1998-05-02 07:58:17 UTC" "1998-06-14 12:56:20 UTC"
# Examine the head() of the months of release_time
head(month(release_time))
## [1] 12 12 1 3 5 6
# Extract the month of releases
month(release_time) %>% table()
## .
## 1 2 3 4 5 6 7 8 9 10 11 12
## 5 6 8 18 5 16 4 7 2 15 6 13
# Extract the year of releases
year(release_time) %>% table()
## .
## 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011
## 2 10 9 6 6 5 5 4 4 4 4 6 5 4 6
## 2012 2013 2014 2015 2016 2017
## 4 4 4 5 5 3
# How often is the hour before 12 (noon)?
mean(hour(release_time) < 12)
## [1] 0.752381
# How often is the release in am?
mean(am(release_time))
## [1] 0.752381
# Use wday() to tabulate release by day of the week
wday(releases$datetime) %>% table()
## .
## 1 2 3 4 5 6 7
## 3 29 9 12 18 31 3
# Add label = TRUE to make table more readable
wday(releases$datetime, label=TRUE) %>% table()
## .
## Sun Mon Tue Wed Thu Fri Sat
## 3 29 9 12 18 31 3
# Create column wday to hold labelled week days
releases$wday <- wday(releases$datetime, label=TRUE)
# Plot barchart of weekday by type of release
ggplot(releases, aes(x=wday)) +
geom_bar() +
facet_wrap(~ type, ncol = 1, scale = "free_y")
# Add columns for year, yday and month
akl_daily <- akl_daily %>%
mutate(
year = year(date),
yday = yday(date),
month = month(date, label=TRUE))
# Plot max_temp by yday for all years
ggplot(akl_daily, aes(x = yday, y = max_temp)) +
geom_line(aes(group = year), alpha = 0.5)
## Warning: Removed 1 rows containing missing values (geom_path).
# Examine distribution of max_temp by month (one density ridge per month)
ggplot(akl_daily, aes(x = max_temp, y = month, height = ..density..)) +
geom_density_ridges(stat = "density")
## Warning: Removed 10 rows containing non-finite values (stat_density).
# Create new columns hour, month and rainy
akl_hourly <- akl_hourly %>%
mutate(
hour = hour(datetime),
month = month(datetime, label=TRUE),
rainy = (weather == "Precipitation")
)
# Filter for hours between 8am and 10pm (inclusive)
akl_day <- akl_hourly %>%
filter(hour >= 8, hour <= 22)
# Summarise for each date if there is any rain
rainy_days <- akl_day %>%
group_by(month, date) %>%
summarise(
any_rain = any(rainy)
)
# Summarise for each month, the number of days with rain
rainy_days %>%
summarise(
days_rainy = sum(any_rain)
)
## # A tibble: 12 x 2
## month days_rainy
## <ord> <int>
## 1 Jan 15
## 2 Feb 13
## 3 Mar 12
## 4 Apr 15
## 5 May 21
## 6 Jun 19
## 7 Jul 22
## 8 Aug 16
## 9 Sep 25
## 10 Oct 20
## 11 Nov 19
## 12 Dec 11
# NOTE(review): the variable is named r_3_4_1, but this datetime (2016-05-03)
# does not match the R 3.4.1 release datetime shown earlier in this document
# (2017-06-30) — presumably a leftover from the course exercise; confirm
# before relying on the name.
r_3_4_1 <- ymd_hms("2016-05-03 07:13:28 UTC")
# Round down to day
floor_date(r_3_4_1, unit = "day")
## [1] "2016-05-03 UTC"
# Round to nearest 5 minutes
round_date(r_3_4_1, unit = "5 minutes")
## [1] "2016-05-03 07:15:00 UTC"
# Round up to week
ceiling_date(r_3_4_1, unit = "week")
## [1] "2016-05-08 UTC"
# Subtract r_3_4_1 rounded down to day
r_3_4_1 - floor_date(r_3_4_1, unit = "day")
## Time difference of 7.224444 hours
# Create day_hour, datetime rounded down to hour
akl_hourly <- akl_hourly %>%
mutate(
day_hour = floor_date(datetime, unit = "hour")
)
# Count observations per hour
akl_hourly %>%
count(day_hour)
## # A tibble: 8,770 x 2
## day_hour n
## <dttm> <int>
## 1 2016-01-01 00:00:00 2
## 2 2016-01-01 01:00:00 2
## 3 2016-01-01 02:00:00 2
## 4 2016-01-01 03:00:00 2
## 5 2016-01-01 04:00:00 2
## 6 2016-01-01 05:00:00 2
## 7 2016-01-01 06:00:00 2
## 8 2016-01-01 07:00:00 2
## 9 2016-01-01 08:00:00 2
## 10 2016-01-01 09:00:00 2
## # ... with 8,760 more rows
# Find day_hours with n != 2
akl_hourly %>%
count(day_hour) %>%
filter(n != 2) %>%
arrange(desc(n))
## # A tibble: 92 x 2
## day_hour n
## <dttm> <int>
## 1 2016-04-03 02:00:00 4
## 2 2016-09-25 00:00:00 4
## 3 2016-06-26 09:00:00 1
## 4 2016-09-01 23:00:00 1
## 5 2016-09-02 01:00:00 1
## 6 2016-09-04 11:00:00 1
## 7 2016-09-04 16:00:00 1
## 8 2016-09-04 17:00:00 1
## 9 2016-09-05 00:00:00 1
## 10 2016-09-05 15:00:00 1
## # ... with 82 more rows
Chapter 3 - Arithmetic with Dates and Times
Taking differences of datetimes:
Time spans - difficult because they do not have a constant meaning (e.g., impact of daylight savings time):
Intervals - third option in lubridate for storing times:
Example code includes:
# The date of landing and moment of step
date_landing <- mdy("July 20, 1969")
moment_step <- mdy_hms("July 20, 1969, 02:56:15", tz = "UTC")
# How many days since the first man on the moon?
difftime(today(), date_landing, units = "days")
## Time difference of 17756 days
# How many seconds since the first man on the moon?
difftime(now(), moment_step, units = "secs")
## Time difference of 1534157851 secs
# Three dates
mar_11 <- ymd_hms("2017-03-11 12:00:00",
tz = "America/Los_Angeles")
mar_12 <- ymd_hms("2017-03-12 12:00:00",
tz = "America/Los_Angeles")
mar_13 <- ymd_hms("2017-03-13 12:00:00",
tz = "America/Los_Angeles")
# Difference between mar_13 and mar_12 in seconds
difftime(mar_13, mar_12, units = "secs")
## Time difference of 86400 secs
# Difference between mar_12 and mar_11 in seconds
difftime(mar_12, mar_11, units = "secs")
## Time difference of 82800 secs
# Add a period of one week to mon_2pm
mon_2pm <- dmy_hm("27 Aug 2018 14:00")
mon_2pm + weeks(1)
## [1] "2018-09-03 14:00:00 UTC"
# Add a duration of 81 hours to tue_9am
tue_9am <- dmy_hm("28 Aug 2018 9:00")
tue_9am + dhours(81)
## [1] "2018-08-31 18:00:00 UTC"
# Subtract a period of five years from today()
today() - years(5)
## [1] "2013-03-01"
# Subtract a duration of five years from today()
today() - dyears(5)
## [1] "2013-03-02"
# Time of North American Eclipse 2017
eclipse_2017 <- ymd_hms("2017-08-21 18:26:40")
# Duration of 29 days, 12 hours, 44 mins and 3 secs
synodic <- ddays(29) + dhours(12) + dminutes(44) + dseconds(3)
# 223 synodic months
saros <- 223 * synodic
# Add saros to eclipse_2017
eclipse_2017 + saros
## [1] "2035-09-02 02:09:49 UTC"
# Add a period of 8 hours to today
today_8am <- today() + hours(8)
# Sequence of two weeks from 1 to 26
every_two_weeks <- 1:26 * weeks(2)
# Create datetime for every two weeks for a year
today_8am + every_two_weeks
## [1] "2018-03-15 08:00:00 UTC" "2018-03-29 08:00:00 UTC"
## [3] "2018-04-12 08:00:00 UTC" "2018-04-26 08:00:00 UTC"
## [5] "2018-05-10 08:00:00 UTC" "2018-05-24 08:00:00 UTC"
## [7] "2018-06-07 08:00:00 UTC" "2018-06-21 08:00:00 UTC"
## [9] "2018-07-05 08:00:00 UTC" "2018-07-19 08:00:00 UTC"
## [11] "2018-08-02 08:00:00 UTC" "2018-08-16 08:00:00 UTC"
## [13] "2018-08-30 08:00:00 UTC" "2018-09-13 08:00:00 UTC"
## [15] "2018-09-27 08:00:00 UTC" "2018-10-11 08:00:00 UTC"
## [17] "2018-10-25 08:00:00 UTC" "2018-11-08 08:00:00 UTC"
## [19] "2018-11-22 08:00:00 UTC" "2018-12-06 08:00:00 UTC"
## [21] "2018-12-20 08:00:00 UTC" "2019-01-03 08:00:00 UTC"
## [23] "2019-01-17 08:00:00 UTC" "2019-01-31 08:00:00 UTC"
## [25] "2019-02-14 08:00:00 UTC" "2019-02-28 08:00:00 UTC"
jan_31 <- ymd("2018-01-31")
# A sequence of 1 to 12 periods of 1 month
month_seq <- 1:12 * months(1)
# Add 1 to 12 months to jan_31
jan_31 + month_seq
## [1] NA "2018-03-31" NA "2018-05-31" NA
## [6] "2018-07-31" "2018-08-31" NA "2018-10-31" NA
## [11] "2018-12-31" "2019-01-31"
# Replace + with %m+%
jan_31 %m+% month_seq
## [1] "2018-02-28" "2018-03-31" "2018-04-30" "2018-05-31" "2018-06-30"
## [6] "2018-07-31" "2018-08-31" "2018-09-30" "2018-10-31" "2018-11-30"
## [11] "2018-12-31" "2019-01-31"
# Replace + with %m-%
jan_31 %m-% month_seq
## [1] "2017-12-31" "2017-11-30" "2017-10-31" "2017-09-30" "2017-08-31"
## [6] "2017-07-31" "2017-06-30" "2017-05-31" "2017-04-30" "2017-03-31"
## [11] "2017-02-28" "2017-01-31"
# Create monarchs
mNames <- c('Elizabeth II' ,'Victoria' ,'George V' ,'George III' ,'George VI' ,'George IV' ,'Edward VII' ,'William IV' ,'Edward VIII' ,'George III(also United Kingdom)' ,'George II' ,'George I' ,'Anne' ,'Henry III' ,'Edward III' ,'Elizabeth I' ,'Henry VI' ,'Henry VI' ,'Æthelred II' ,'Æthelred II' ,'Henry VIII' ,'Charles II' ,'Henry I' ,'Henry II(co-ruler with Henry the Young King)' ,'Edward I' ,'Alfred the Great' ,'Edward the Elder' ,'Charles I' ,'Henry VII' ,'Edward the Confessor' ,'Richard II' ,'James I' ,'Edward IV' ,'Edward IV' ,'William I' ,'Edward II' ,'Cnut' ,'Stephen' ,'Stephen' ,'John' ,'Edgar I' ,'Æthelstan' ,'Henry IV' ,'William III(co-ruler with Mary II)' ,'Henry the Young King(co-ruler with Henry II)' ,'William II' ,'Richard I' ,'Eadred' ,'Henry V' ,'Edmund I' ,'Edward VI' ,'Mary II(co-ruler with William III)' ,'Mary I' ,'Anne(also Kingdom of Great Britain)' ,'Eadwig' ,'James II' ,'Edward the Martyr' ,'Harold I' ,'Harthacnut' ,'Richard III' ,'Louis (disputed)' ,'Harold II' ,'Edmund II' ,'Matilda (disputed)' ,'Edward V' ,'Edgar II' ,'Sweyn Forkbeard' ,'Jane (disputed)' ,'James VI' ,'William I' ,'Constantine II' ,'David II' ,'Alexander III' ,'Malcolm III' ,'Alexander II' ,'James I' ,'Malcolm II' ,'James V' ,'David I' ,'James III' ,'Charles II' ,'Charles II' ,'James IV' ,'Mary I' ,'Charles I' ,'Kenneth II' ,'James II' ,'Robert I' ,'Robert II' ,'Alexander I' ,'Macbeth' ,'Robert III' ,'Constantine I' ,'Kenneth MacAlpin' ,'William II' ,'Malcolm IV' ,'Giric(co-ruler with Eochaid?)' ,'Donald II' ,'Malcolm I' ,'Edgar' ,'Kenneth III' ,'Indulf' ,'Duncan I' ,'Mary II' ,'Amlaíb' ,'Anne(also Kingdom of Great Britain)' ,'Dub' ,'Cuilén' ,'Domnall mac Ailpín' ,'James VII' ,'Margaret' ,'John Balliol' ,'Donald III' ,'Constantine III' ,'Áed mac Cináeda' ,'Lulach' ,'Duncan II' ,'Ruaidrí Ua Conchobair' ,'Edward Bruce (disputed)' ,'Brian Ua Néill (disputed)' ,'Gruffudd ap Cynan' ,'Llywelyn the Great' ,'Owain Gwynedd' ,'Dafydd ab Owain Gwynedd' ,'Hywel ab Owain Gwynedd' 
,'Llywelyn ap Gruffudd' ,'Owain Glyndwr (disputed)' ,'Owain Goch ap Gruffydd' ,'Owain Lawgoch (disputed)' ,'Dafydd ap Llywelyn' ,'Dafydd ap Gruffydd')
mDominion <- c('United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'Great Britain' ,'Great Britain' ,'Great Britain' ,'Great Britain' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Ireland' ,'Ireland' ,'Ireland' ,'Gwynedd' ,'Gwynedd' ,'Gwynedd' ,'Gwynedd' ,'Gwynedd' ,'Wales' ,'Wales' ,'Wales' ,'Wales' ,'Wales' ,'Wales')
mFrom <- c('1952-02-06' ,'1837-06-20' ,'1910-05-06' ,'1801-01-01' ,'1936-12-11' ,'1820-01-29' ,'1901-01-22' ,'1830-06-26' ,'1936-01-20' ,'1760-10-25' ,'1727-06-22' ,'1714-08-01' ,'1707-05-01' ,'NA' ,'1327-01-25' ,'1558-11-17' ,'1422-08-31' ,'1470-10-31' ,'978-03-18' ,'1014-02-03' ,'1509-04-22' ,'1649-01-30' ,'1100-08-03' ,'1154-10-25' ,'1272-11-20' ,'871-04-24' ,'899-10-27' ,'1625-03-27' ,'1485-08-22' ,'1042-06-08' ,'1377-06-22' ,'1603-03-24' ,'1461-03-04' ,'1471-04-11' ,'1066-12-12' ,'1307-07-07' ,'1016-11-30' ,'1135-12-22' ,'1141-11-01' ,'1199-04-06' ,'959-10-01' ,'924-08-02' ,'1399-09-29' ,'1689-02-13' ,'1170-06-14' ,'1087-09-09' ,'1189-07-06' ,'946-05-26' ,'1413-03-21' ,'939-10-27' ,'1547-01-28' ,'1689-02-13' ,'1553-07-19' ,'1702-03-08' ,'955-11-23' ,'1685-02-06' ,'975-07-09' ,'1037-11-12' ,'1040-03-17' ,'1483-06-26' ,'1216-06-14' ,'1066-01-05' ,'1016-04-23' ,'1141-04-07' ,'1483-04-09' ,'1066-10-15' ,'1013-12-25' ,'1553-07-10' ,'1567-07-24' ,'1165-12-09' ,'900-01-01' ,'1329-06-07' ,'1249-07-06' ,'1058-03-17' ,'1214-12-04' ,'1406-04-04' ,'1005-03-25' ,'1513-09-09' ,'1124-04-23' ,'1460-08-03' ,'1649-01-30' ,'1660-05-29' ,'1488-06-11' ,'1542-12-14' ,'1625-03-27' ,'971-01-01' ,'1437-02-21' ,'1306-03-25' ,'1371-02-22' ,'1107-01-08' ,'1040-08-14' ,'1390-04-19' ,'862-01-01' ,'843-01-01' ,'1689-05-11' ,'1153-05-24' ,'878-01-01' ,'889-01-01' ,'943-01-01' ,'1097-01-01' ,'997-01-01' ,'954-01-01' ,'1034-11-25' ,'1689-04-11' ,'971-01-01' ,'1702-03-08' ,'962-01-01' ,'NA' ,'858-01-01' ,'1685-02-06' ,'1286-11-25' ,'1292-11-17' ,'1093-11-13' ,'1095-01-01' ,'877-01-01' ,'1057-08-15' ,'1094-05-01' ,'1166-01-01' ,'1315-06-01' ,'1258-01-01' ,'1081-01-01' ,'1195-01-01' ,'1137-01-01' ,'1170-01-01' ,'1170-01-01' ,'1253-01-01' ,'1400-09-16' ,'1246-02-25' ,'1372-05-01' ,'1240-04-12' ,'1282-12-11')
mTo <- c('2018-02-08' ,'1901-01-22' ,'1936-01-20' ,'1820-01-29' ,'1952-02-06' ,'1830-06-26' ,'1910-05-06' ,'1837-06-20' ,'1936-12-11' ,'1801-01-01' ,'1760-10-25' ,'1727-06-11' ,'1714-08-01' ,'1272-11-16' ,'1377-06-21' ,'1603-03-24' ,'1461-03-04' ,'1471-04-11' ,'1013-12-25' ,'1016-04-23' ,'1547-01-28' ,'1685-02-06' ,'1135-12-01' ,'1189-07-06' ,'1307-07-07' ,'899-10-26' ,'924-07-17' ,'1649-01-30' ,'1509-04-21' ,'1066-01-05' ,'1399-09-29' ,'1625-03-27' ,'1470-10-03' ,'1483-04-09' ,'1087-09-09' ,'1327-01-20' ,'1035-11-12' ,'1141-04-07' ,'1154-10-25' ,'1216-10-19' ,'975-07-08' ,'939-10-27' ,'1413-03-20' ,'1702-03-08' ,'1183-06-11' ,'1100-08-02' ,'1199-04-06' ,'955-11-23' ,'1422-08-31' ,'946-05-26' ,'1553-07-06' ,'1694-12-28' ,'1558-11-17' ,'1707-04-30' ,'959-10-01' ,'1688-12-11' ,'978-03-18' ,'1040-03-17' ,'1042-06-08' ,'1485-08-22' ,'1217-09-22' ,'1066-10-14' ,'1016-11-30' ,'1141-11-01' ,'1483-06-26' ,'1066-12-17' ,'1014-02-03' ,'1553-07-19' ,'1625-03-27' ,'1214-12-04' ,'943-01-01' ,'1371-02-22' ,'1286-03-19' ,'1093-11-13' ,'1249-07-06' ,'1437-02-21' ,'1034-11-25' ,'1542-12-14' ,'1153-05-24' ,'1488-06-11' ,'1651-09-03' ,'1685-02-06' ,'1513-09-09' ,'1567-07-24' ,'1649-01-30' ,'995-01-01' ,'1460-08-03' ,'1329-06-07' ,'1390-04-19' ,'1124-04-23' ,'1057-08-15' ,'1406-04-04' ,'877-01-01' ,'858-02-13' ,'1702-03-08' ,'1165-12-09' ,'889-01-01' ,'900-01-01' ,'954-01-01' ,'1107-01-08' ,'1005-03-25' ,'962-01-01' ,'1040-08-14' ,'1694-12-28' ,'977-01-01' ,'1707-04-30' ,'NA' ,'971-01-01' ,'862-04-13' ,'1688-12-11' ,'1290-09-26' ,'1296-07-10' ,'1097-01-01' ,'1097-01-01' ,'878-01-01' ,'1058-03-17' ,'1094-11-12' ,'1193-01-01' ,'1318-10-14' ,'1260-01-01' ,'1137-01-01' ,'1240-04-11' ,'1170-01-01' ,'1195-01-01' ,'1170-01-01' ,'1282-12-11' ,'1416-01-01' ,'1255-01-01' ,'1378-07-01' ,'1246-02-25' ,'1283-10-03')
padMDate <- function(x) {
  # Re-assemble a date split into c(year, month, day) parts into a
  # "YYYY-MM-DD" string, zero-padding the year to four digits so that
  # lubridate::ymd() can parse early dates, e.g. c("66","01","26") ->
  # "0066-01-26".
  #
  # x: character vector of length 3: c(year, month, day).
  # Returns: a single "YYYY-MM-DD" string, or NA_character_ when the year
  #   is missing (a real NA or the literal string "NA").
  if (is.na(x[1]) || x[1] == "NA") {
    # `||` short-circuits on the scalar condition (the original `|` also
    # evaluated `NA == "NA"`). Return a typed NA so sapply() over many
    # dates simplifies cleanly to a character vector.
    NA_character_
  } else {
    # sprintf() zero-pads the year; this drops the dependency on
    # stringr::str_pad() for this helper.
    paste0(c(sprintf("%04d", as.integer(x[1])), x[2], x[3]), collapse = "-")
  }
}
monarchs <- tibble::tibble(name=mNames, dominion=mDominion,
from=ymd(sapply(str_split(mFrom, "-"), FUN=padMDate)),
to=ymd(sapply(str_split(mTo, "-"), FUN=padMDate))
)
# Print monarchs
monarchs
## # A tibble: 131 x 4
## name dominion from to
## <chr> <chr> <date> <date>
## 1 Elizabeth II United Kingdom 1952-02-06 2018-02-08
## 2 Victoria United Kingdom 1837-06-20 1901-01-22
## 3 George V United Kingdom 1910-05-06 1936-01-20
## 4 George III United Kingdom 1801-01-01 1820-01-29
## 5 George VI United Kingdom 1936-12-11 1952-02-06
## 6 George IV United Kingdom 1820-01-29 1830-06-26
## 7 Edward VII United Kingdom 1901-01-22 1910-05-06
## 8 William IV United Kingdom 1830-06-26 1837-06-20
## 9 Edward VIII United Kingdom 1936-01-20 1936-12-11
## 10 George III(also United Kingdom) Great Britain 1760-10-25 1801-01-01
## # ... with 121 more rows
# Create an interval for reign
monarchs <- monarchs %>%
mutate(reign = from %--% to)
# Find the length of reign, and arrange
monarchs %>%
mutate(length = int_length(reign)) %>%
arrange(desc(length)) %>%
select(name, length, dominion)
## # A tibble: 131 x 3
## name length dominion
## <chr> <dbl> <chr>
## 1 Elizabeth II 2083017600 United Kingdom
## 2 Victoria 2006726400 United Kingdom
## 3 James VI 1820102400 Scotland
## 4 Gruffudd ap Cynan 1767139200 Gwynedd
## 5 Edward III 1590624000 England
## 6 William I 1545868800 Scotland
## 7 Llywelyn the Great 1428796800 Gwynedd
## 8 Elizabeth I 1399507200 England
## 9 Constantine II 1356912000 Scotland
## 10 David II 1316304000 Scotland
## # ... with 121 more rows
# Print halleys
pDate <- c('66-01-26', '141-03-25', '218-04-06', '295-04-07', '374-02-13', '451-07-03', '530-11-15', '607-03-26', '684-11-26', '760-06-10', '837-02-25', '912-07-27', '989-09-02', '1066-03-25', '1145-04-19', '1222-09-10', '1301-10-22', '1378-11-09', '1456-01-08', '1531-08-26', '1607-10-27', '1682-09-15', '1758-03-13', '1835-11-16', '1910-04-20', '1986-02-09', '2061-07-28')
sDate <- c('66-01-25', '141-03-22', '218-04-06', '295-04-07', '374-02-13', '451-06-28', '530-09-27', '607-03-15', '684-10-02', '760-05-20', '837-02-25', '912-07-18', '989-09-02', '1066-01-01', '1145-04-15', '1222-09-10', '1301-10-22', '1378-11-09', '1456-01-08', '1531-08-26', '1607-10-27', '1682-09-15', '1758-03-13', '1835-08-01', '1910-04-20', '1986-02-09', '2061-07-28')
eDate <- c('66-01-26', '141-03-25', '218-05-17', '295-04-20', '374-02-16', '451-07-03', '530-11-15', '607-03-26', '684-11-26', '760-06-10', '837-02-28', '912-07-27', '989-09-05', '1066-03-25', '1145-04-19', '1222-09-28', '1301-10-31', '1378-11-14', '1456-06-09', '1531-08-26', '1607-10-27', '1682-09-15', '1758-12-25', '1835-11-16', '1910-05-20', '1986-02-09', '2061-07-28')
halleys <- tibble::tibble(perihelion_date=ymd(sapply(str_split(pDate, "-"), FUN=padMDate)),
start_date=ymd(sapply(str_split(sDate, "-"), FUN=padMDate)),
end_date=ymd(sapply(str_split(eDate, "-"), FUN=padMDate))
)
# New column for interval from start to end date
halleys <- halleys %>%
mutate(visible = start_date %--% end_date)
# The visitation of 1066
halleys_1066 <- halleys[14, ]
# Monarchs in power on perihelion date
monarchs %>%
filter(halleys_1066$perihelion_date %within% reign) %>%
select(name, from, to, dominion)
## # A tibble: 2 x 4
## name from to dominion
## <chr> <date> <date> <chr>
## 1 Harold II 1066-01-05 1066-10-14 England
## 2 Malcolm III 1058-03-17 1093-11-13 Scotland
# Monarchs whose reign overlaps visible time
monarchs %>%
filter(int_overlaps(halleys_1066$visible, reign)) %>%
select(name, from, to, dominion)
## # A tibble: 3 x 4
## name from to dominion
## <chr> <date> <date> <chr>
## 1 Edward the Confessor 1042-06-08 1066-01-05 England
## 2 Harold II 1066-01-05 1066-10-14 England
## 3 Malcolm III 1058-03-17 1093-11-13 Scotland
# New columns for duration and period
monarchs <- monarchs %>%
mutate(
duration = as.duration(reign),
period = as.period(reign))
# Examine results
monarchs %>%
select(name, duration, period) %>%
head(10) %>%
print.data.frame()
## name duration
## 1 Elizabeth II 2083017600s (~66.01 years)
## 2 Victoria 2006726400s (~63.59 years)
## 3 George V 811296000s (~25.71 years)
## 4 George III 601948800s (~19.07 years)
## 5 George VI 478224000s (~15.15 years)
## 6 George IV 328406400s (~10.41 years)
## 7 Edward VII 292982400s (~9.28 years)
## 8 William IV 220406400s (~6.98 years)
## 9 Edward VIII 28166400s (~46.57 weeks)
## 10 George III(also United Kingdom) 1268092800s (~40.18 years)
## period
## 1 66y 0m 2d 0H 0M 0S
## 2 63y 7m 2d 0H 0M 0S
## 3 25y 8m 14d 0H 0M 0S
## 4 19y 0m 28d 0H 0M 0S
## 5 15y 1m 26d 0H 0M 0S
## 6 10y 4m 28d 0H 0M 0S
## 7 9y 3m 14d 0H 0M 0S
## 8 6y 11m 25d 0H 0M 0S
## 9 10m 21d 0H 0M 0S
## 10 40y 2m 7d 0H 0M 0S
Chapter 4 - Problems in Practice
Time zones - ways to keep track of times in different locations (can pose analysis challenges):
Importing and exporting datetimes:
Wrap-up:
Example code includes:
# Game2: CAN vs NZL in Edmonton
game2 <- mdy_hm("June 11 2015 19:00")
# Game3: CHN vs NZL in Winnipeg
game3 <- mdy_hm("June 15 2015 18:30")
# Set the timezone to "America/Edmonton"
game2_local <- force_tz(game2, tzone = "America/Edmonton")
game2_local
## [1] "2015-06-11 19:00:00 MDT"
# Set the timezone to "America/Winnipeg"
game3_local <- force_tz(game3, tzone = "America/Winnipeg")
game3_local
## [1] "2015-06-15 18:30:00 CDT"
# How long does the team have to rest?
as.period(game2_local %--% game3_local)
## [1] "3d 22H 30M 0S"
# What time is game2_local in NZ?
with_tz(game2_local, tzone = "Pacific/Auckland")
## [1] "2015-06-12 13:00:00 NZST"
# What time is game2_local in Corvallis, Oregon?
with_tz(game2_local, tzone = "America/Los_Angeles")
## [1] "2015-06-11 18:00:00 PDT"
# What time is game3_local in NZ?
with_tz(game3_local, tzone = "Pacific/Auckland")
## [1] "2015-06-16 11:30:00 NZST"
# Examine datetime and date_utc columns
head(akl_hourly$datetime)
## [1] "2016-01-01 00:00:00 UTC" "2016-01-01 00:30:00 UTC"
## [3] "2016-01-01 01:00:00 UTC" "2016-01-01 01:30:00 UTC"
## [5] "2016-01-01 02:00:00 UTC" "2016-01-01 02:30:00 UTC"
head(akl_hourly$date_utc)
## [1] "2015-12-31 11:00:00 UTC" "2015-12-31 11:30:00 UTC"
## [3] "2015-12-31 12:00:00 UTC" "2015-12-31 12:30:00 UTC"
## [5] "2015-12-31 13:00:00 UTC" "2015-12-31 13:30:00 UTC"
# Force datetime to Pacific/Auckland
akl_hourly <- akl_hourly %>%
mutate(
datetime = force_tz(datetime, tzone = "Pacific/Auckland"))
# Reexamine datetime
head(akl_hourly$datetime)
## [1] "2016-01-01 00:00:00 NZDT" "2016-01-01 00:30:00 NZDT"
## [3] "2016-01-01 01:00:00 NZDT" "2016-01-01 01:30:00 NZDT"
## [5] "2016-01-01 02:00:00 NZDT" "2016-01-01 02:30:00 NZDT"
# Are datetime and date_utc the same moments
table(akl_hourly$datetime - akl_hourly$date_utc)
##
## -82800 0 3600
## 2 17450 2
# Import auckland hourly data
akl_hourly <- read_csv("./RInputFiles/akl_weather_hourly_2016.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## month = col_integer(),
## mday = col_integer(),
## time = col_time(format = ""),
## temperature = col_double(),
## weather = col_character(),
## conditions = col_character(),
## events = col_character(),
## humidity = col_integer(),
## date_utc = col_datetime(format = "")
## )
# Examine structure of time column
str(akl_hourly$time)
## Classes 'hms', 'difftime' atomic [1:17454] 0 1800 3600 5400 7200 9000 10800 12600 14400 16200 ...
## ..- attr(*, "units")= chr "secs"
# Examine head of time column
head(akl_hourly$time)
## 00:00:00
## 00:30:00
## 01:00:00
## 01:30:00
## 02:00:00
## 02:30:00
# A plot using just time
ggplot(akl_hourly, aes(x = time, y = temperature)) +
geom_line(aes(group = make_date(year, month, mday)), alpha = 0.2)
library(microbenchmark)
library(fasttime)
# Examine structure of dates
dates <- paste0(gsub(" ", "T", as.character(akl_hourly$date_utc)), "Z")
str(dates)
## chr [1:17454] "2015-12-31T11:00:00Z" "2015-12-31T11:30:00Z" ...
# Use fastPOSIXct() to parse dates
fastPOSIXct(dates) %>% str()
## POSIXct[1:17454], format: "2015-12-31 05:00:00" "2015-12-31 05:30:00" ...
# Compare speed of fastPOSIXct() to ymd_hms()
microbenchmark(
ymd_hms = ymd_hms(dates),
fasttime = fastPOSIXct(dates),
times = 20)
## Unit: milliseconds
## expr min lq mean median uq max neval
## ymd_hms 20.569690 23.806649 29.964983 25.036496 30.639395 89.57849 20
## fasttime 1.568742 1.672562 2.583449 1.692298 3.094651 7.64554 20
## cld
## b
## a
# Head of dates
head(dates)
## [1] "2015-12-31T11:00:00Z" "2015-12-31T11:30:00Z" "2015-12-31T12:00:00Z"
## [4] "2015-12-31T12:30:00Z" "2015-12-31T13:00:00Z" "2015-12-31T13:30:00Z"
# Parse dates with fast_strptime
fast_strptime(dates,
format = "%Y-%m-%dT%H:%M:%SZ") %>% str()
## POSIXlt[1:17454], format: "2015-12-31 11:00:00" "2015-12-31 11:30:00" ...
# Compare speed to ymd_hms() and fasttime
microbenchmark(
ymd_hms = ymd_hms(dates),
fasttime = fastPOSIXct(dates),
fast_strptime = fast_strptime(dates,
format = "%Y-%m-%dT%H:%M:%SZ"),
times = 20)
## Unit: milliseconds
## expr min lq mean median uq max
## ymd_hms 19.194772 23.379725 29.762199 25.412298 37.474510 54.292484
## fasttime 1.586506 1.645520 2.114268 1.679074 2.686282 3.638422
## fast_strptime 1.277810 1.342154 1.570261 1.373340 1.516634 2.811417
## neval cld
## 20 b
## 20 a
## 20 a
finished <- "I finished 'Dates and Times in R' on Thursday, September 20, 2017!"
# Create a stamp based on "Sep 20 2017"
date_stamp <- stamp("September 20, 2017", orders="mdy")
## Multiple formats matched: "%Om %d, %Y"(1), "%B %d, %Y"(1)
## Using: "%B %d, %Y"
# Print date_stamp
date_stamp
## function (x, locale = "English_United States.1252")
## {
## {
## old_lc_time <- Sys.getlocale("LC_TIME")
## if (old_lc_time != locale) {
## Sys.setlocale("LC_TIME", locale)
## on.exit(Sys.setlocale("LC_TIME", old_lc_time))
## }
## }
## format(x, format = "%B %d, %Y")
## }
## <environment: 0x0000000011a645b8>
# Call date_stamp on today()
date_stamp(today())
## [1] "March 01, 2018"
# Create and call a stamp based on "09/20/2017"
stamp("09/20/2017", orders="mdy")(today())
## Multiple formats matched: "%Om/%d/%Y"(1), "%m/%d/%Y"(1)
## Using: "%Om/%d/%Y"
## [1] "03/01/2018"
# Use string finished for stamp()
stamp(finished, orders="amdy")(today())
## Multiple formats matched: "I finished 'Dates and Times in R' on %A, %B %d, %Y!"(1), "I finished 'Dates and Times in R' on %A, %Om %d, %Y!"(0)
## Using: "I finished 'Dates and Times in R' on %A, %B %d, %Y!"
## [1] "I finished 'Dates and Times in R' on Thursday, March 01, 2018!"
Chapter 1 - Working with Increasingly Large Data Sets
What is scalable data processing?:
Working with “out of core” objects using the Bigmemory Project:
References vs. Copies:
Example code includes:
# Load the microbenchmark package
library(microbenchmark)
# Compare the timings for sorting different sizes of vector
mb <- microbenchmark(
# Sort a random normal vector length 1e5
"1e5" = sort(rnorm(1e5)),
# Sort a random normal vector length 2.5e5
"2.5e5" = sort(rnorm(2.5e5)),
# Sort a random normal vector length 5e5
"5e5" = sort(rnorm(5e5)),
"7.5e5" = sort(rnorm(7.5e5)),
"1e6" = sort(rnorm(1e6)),
times = 10
)
# Plot the resulting benchmark object
plot(mb)
# Load the bigmemory package
library(bigmemory)
# Create the big.matrix object: x
x <- read.big.matrix("./RInputFiles/mortgage-sample.csv", header = TRUE,
type = "integer",
backingfile = "mortgage-sample.bin",
descriptorfile = "mortgage-sample.desc")
# Find the dimensions of x
dim(x)
## [1] 70000 16
# Attach mortgage-sample.desc
mort <- attach.big.matrix("mortgage-sample.desc")
# Find the dimensions of mort
dim(mort)
## [1] 70000 16
# Look at the first 6 rows of mort
head(mort)
## enterprise record_number msa perc_minority tract_income_ratio
## [1,] 1 566 1 1 3
## [2,] 1 116 1 3 2
## [3,] 1 239 1 2 2
## [4,] 1 62 1 2 3
## [5,] 1 106 1 2 3
## [6,] 1 759 1 3 3
## borrower_income_ratio loan_purpose federal_guarantee borrower_race
## [1,] 1 2 4 3
## [2,] 1 2 4 5
## [3,] 3 8 4 5
## [4,] 3 2 4 5
## [5,] 3 2 4 9
## [6,] 2 2 4 9
## co_borrower_race borrower_gender co_borrower_gender num_units
## [1,] 9 2 4 1
## [2,] 9 1 4 1
## [3,] 5 1 2 1
## [4,] 9 2 4 1
## [5,] 9 3 4 1
## [6,] 9 1 2 2
## affordability year type
## [1,] 3 2010 1
## [2,] 3 2008 1
## [3,] 4 2014 0
## [4,] 4 2009 1
## [5,] 4 2013 1
## [6,] 4 2010 1
# Create mort
mort <- attach.big.matrix("mortgage-sample.desc")
# Look at the first 3 rows
mort[1:3, ]
## enterprise record_number msa perc_minority tract_income_ratio
## [1,] 1 566 1 1 3
## [2,] 1 116 1 3 2
## [3,] 1 239 1 2 2
## borrower_income_ratio loan_purpose federal_guarantee borrower_race
## [1,] 1 2 4 3
## [2,] 1 2 4 5
## [3,] 3 8 4 5
## co_borrower_race borrower_gender co_borrower_gender num_units
## [1,] 9 2 4 1
## [2,] 9 1 4 1
## [3,] 5 1 2 1
## affordability year type
## [1,] 3 2010 1
## [2,] 3 2008 1
## [3,] 4 2014 0
# Create a table of the number of mortgages for each year in the data set
table(mort[, "year"])
##
## 2008 2009 2010 2011 2012 2013 2014 2015
## 8468 11101 8836 7996 10935 10216 5714 6734
a <- getLoadedDLLs()
length(a)
## [1] 39
R.utils::gcDLLs()
## named list()
a <- getLoadedDLLs()
length(a)
## [1] 39
# Load the biganalytics package (error in loading to Knit file, works OK otherwise)
library(biganalytics)
## Loading required package: foreach
## Loading required package: biglm
## Loading required package: DBI
# Get the column means of mort
colmean(mort)
## enterprise record_number msa
## 1.3814571 499.9080571 0.8943571
## perc_minority tract_income_ratio borrower_income_ratio
## 1.9701857 2.3431571 2.6898857
## loan_purpose federal_guarantee borrower_race
## 3.7670143 3.9840857 5.3572429
## co_borrower_race borrower_gender co_borrower_gender
## 7.0002714 1.4590714 3.0494857
## num_units affordability year
## 1.0398143 4.2863429 2011.2714714
## type
## 0.5300429
# Use biganalytics' summary function to get a summary of the data
summary(mort)
## min max mean NAs
## enterprise 1.0000000 2.0000000 1.3814571 0.0000000
## record_number 0.0000000 999.0000000 499.9080571 0.0000000
## msa 0.0000000 1.0000000 0.8943571 0.0000000
## perc_minority 1.0000000 9.0000000 1.9701857 0.0000000
## tract_income_ratio 1.0000000 9.0000000 2.3431571 0.0000000
## borrower_income_ratio 1.0000000 9.0000000 2.6898857 0.0000000
## loan_purpose 1.0000000 9.0000000 3.7670143 0.0000000
## federal_guarantee 1.0000000 4.0000000 3.9840857 0.0000000
## borrower_race 1.0000000 9.0000000 5.3572429 0.0000000
## co_borrower_race 1.0000000 9.0000000 7.0002714 0.0000000
## borrower_gender 1.0000000 9.0000000 1.4590714 0.0000000
## co_borrower_gender 1.0000000 9.0000000 3.0494857 0.0000000
## num_units 1.0000000 4.0000000 1.0398143 0.0000000
## affordability 0.0000000 9.0000000 4.2863429 0.0000000
## year 2008.0000000 2015.0000000 2011.2714714 0.0000000
## type 0.0000000 1.0000000 0.5300429 0.0000000
# Use deepcopy() to create first_three
first_three <- deepcopy(mort, cols = 1:3,
backingfile = "first_three.bin",
descriptorfile = "first_three.desc")
# Set first_three_2 equal to first_three
first_three_2 <- first_three
# Set the value in the first row and first column of first_three to NA
first_three[1, 1] <- NA
# Verify the change shows up in first_three_2
first_three_2[1, 1]
## [1] NA
# but not in mort
mort[1, 1]
## [1] 1
Chapter 2 - Processing and Analyzing Data with bigmemory
The Bigmemory Suite of Packages:
Split-Apply-Combine (aka Split-Compute-Combine), run in this course using split() Map() Reduce():
Visualize results using tidyverse:
Limitations of bigmemory - process is useful for dense, numeric matrices that can be stored on hard disk:
Example code includes:
library(bigtabulate)
library(tidyr)
library(ggplot2)
library(biganalytics)
library(dplyr)
race_cat <- c('Native Am', 'Asian', 'Black', 'Pacific Is', 'White', 'Two or More', 'Hispanic', 'Not Avail')
# Call bigtable to create a variable called race_table
race_table <- bigtable(mort, "borrower_race")
# Rename the elements of race_table
names(race_table) <- race_cat
race_table
## Native Am Asian Black Pacific Is White Two or More
## 143 4438 2020 195 50006 528
## Hispanic Not Avail
## 4040 8630
# Create a table of the borrower race by year
race_year_table <- bigtable(mort, c("borrower_race", "year"))
# Convert rydf to a data frame
rydf <- as.data.frame(race_year_table)
# Create the new column Race
rydf$Race <- race_cat
# Let's see what it looks like
rydf
## 2008 2009 2010 2011 2012 2013 2014 2015 Race
## 1 11 18 13 16 15 12 29 29 Native Am
## 2 384 583 603 568 770 673 369 488 Asian
## 3 363 320 209 204 258 312 185 169 Black
## 4 33 38 21 13 28 22 17 23 Pacific Is
## 5 5552 7739 6301 5746 8192 7535 4110 4831 White
## 6 43 85 65 58 89 78 46 64 Two or More
## 7 577 563 384 378 574 613 439 512 Hispanic
## 9 1505 1755 1240 1013 1009 971 519 618 Not Avail
female_residence_prop <- function(x, rows) {
  # Proportion of female borrowers (borrower_gender == 2) among urban
  # (msa == 1) and rural (msa == 0) loans, within the selected rows.
  #
  # x    - matrix-like object with "borrower_gender" and "msa" columns
  #        (used on both plain matrices and bigmemory big.matrix objects)
  # rows - integer or logical index selecting the rows to analyze
  #
  # Returns c(prop_female_urban, prop_female_rural). Either element is
  # NaN when the corresponding residence group is empty (0 / 0).
  # drop = FALSE keeps a one-row subset as a matrix so the named-column
  # indexing below still works (the original collapsed it to a vector).
  x_subset <- x[rows, , drop = FALSE]
  is_female <- x_subset[, "borrower_gender"] == 2
  is_urban <- x_subset[, "msa"] == 1
  is_rural <- x_subset[, "msa"] == 0
  # Proportion of female borrowers in urban areas
  prop_female_urban <- sum(is_female & is_urban) / sum(is_urban)
  # Proportion of female borrowers in rural areas
  prop_female_rural <- sum(is_female & is_rural) / sum(is_rural)
  c(prop_female_urban, prop_female_rural)
}
# Find the proportion of female borrowers in 2015
female_residence_prop(mort, mort[, "year"] == 2015)
## [1] 0.2737439 0.2304965
# Split the row numbers of the mortgage data by year
# (seq_len() is safe when nrow() is 0, unlike 1:nrow())
spl <- split(seq_len(nrow(mort)), mort[, "year"])
# Call str on spl
str(spl)
## List of 8
## $ 2008: int [1:8468] 2 8 15 17 18 28 35 40 42 47 ...
## $ 2009: int [1:11101] 4 13 25 31 43 49 52 56 67 68 ...
## $ 2010: int [1:8836] 1 6 7 10 21 23 24 27 29 38 ...
## $ 2011: int [1:7996] 11 20 37 46 53 57 73 83 86 87 ...
## $ 2012: int [1:10935] 14 16 26 30 32 33 48 69 81 94 ...
## $ 2013: int [1:10216] 5 9 19 22 36 44 55 58 72 74 ...
## $ 2014: int [1:5714] 3 12 50 60 64 66 103 114 122 130 ...
## $ 2015: int [1:6734] 34 41 54 61 62 65 82 91 102 135 ...
# For each of the row splits, find the female residence proportion
all_years <- Map(function(rows) female_residence_prop(mort, rows), spl)
# Call str on all_years
str(all_years)
## List of 8
## $ 2008: num [1:2] 0.275 0.204
## $ 2009: num [1:2] 0.244 0.2
## $ 2010: num [1:2] 0.241 0.201
## $ 2011: num [1:2] 0.252 0.241
## $ 2012: num [1:2] 0.244 0.21
## $ 2013: num [1:2] 0.275 0.257
## $ 2014: num [1:2] 0.289 0.268
## $ 2015: num [1:2] 0.274 0.23
# Collect the results as rows in a matrix
prop_female <- Reduce(rbind, all_years)
# Label rows by year and columns by residence type
# (fixes the misspelled "prop_femal_rural" column name)
dimnames(prop_female) <- list(names(all_years), c("prop_female_urban", "prop_female_rural"))
# View the matrix
prop_female
## prop_female_urban prop_female_rural
## 2008 0.2748514 0.2039474
## 2009 0.2441074 0.2002978
## 2010 0.2413881 0.2014028
## 2011 0.2520644 0.2408931
## 2012 0.2438950 0.2101313
## 2013 0.2751059 0.2567164
## 2014 0.2886756 0.2678571
## 2015 0.2737439 0.2304965
# Convert prop_female to a data frame
prop_female_df <- as.data.frame(prop_female)
# Add a new column Year
prop_female_df$Year <- row.names(prop_female_df)
# Call gather on prop_female_df
prop_female_long <- gather(prop_female_df, Region, Prop, -Year)
# Create a line plot
ggplot(prop_female_long, aes(x = Year, y = Prop, group = Region, color = Region)) +
geom_line()
# Call summary on mort
summary(mort)
## min max mean NAs
## enterprise 1.0000000 2.0000000 1.3814571 0.0000000
## record_number 0.0000000 999.0000000 499.9080571 0.0000000
## msa 0.0000000 1.0000000 0.8943571 0.0000000
## perc_minority 1.0000000 9.0000000 1.9701857 0.0000000
## tract_income_ratio 1.0000000 9.0000000 2.3431571 0.0000000
## borrower_income_ratio 1.0000000 9.0000000 2.6898857 0.0000000
## loan_purpose 1.0000000 9.0000000 3.7670143 0.0000000
## federal_guarantee 1.0000000 4.0000000 3.9840857 0.0000000
## borrower_race 1.0000000 9.0000000 5.3572429 0.0000000
## co_borrower_race 1.0000000 9.0000000 7.0002714 0.0000000
## borrower_gender 1.0000000 9.0000000 1.4590714 0.0000000
## co_borrower_gender 1.0000000 9.0000000 3.0494857 0.0000000
## num_units 1.0000000 4.0000000 1.0398143 0.0000000
## affordability 0.0000000 9.0000000 4.2863429 0.0000000
## year 2008.0000000 2015.0000000 2011.2714714 0.0000000
## type 0.0000000 1.0000000 0.5300429 0.0000000
bir_df_wide <- bigtable(mort, c("borrower_income_ratio", "year")) %>%
as.data.frame() %>%
tibble::rownames_to_column() %>%
filter(rowname %in% c(1, 2, 3)) %>%
select(-rowname) %>%
# Create a new column called BIR with the corresponding table categories
mutate(BIR = c(">=0,<=50%", ">50, <=80%", ">80%"))
bir_df_wide
## 2008 2009 2010 2011 2012 2013 2014 2015 BIR
## 1 1205 1473 600 620 745 725 401 380 >=0,<=50%
## 2 2095 2791 1554 1421 1819 1861 1032 1145 >50, <=80%
## 3 4844 6707 6609 5934 8338 7559 4255 5169 >80%
bir_df_wide %>%
# Transform the wide-formatted data.frame into the long format
gather(Year, Count, -BIR) %>%
# Use ggplot to create a line plot
ggplot(aes(x = Year, y = Count, group = BIR, color = BIR)) +
geom_line()
Chapter 3 - Working with iotools
Introduction to chunk-wise processing - solution to challenges from bigmemory:
First look at iotools: Importing data:
Using chunk.apply - effectively moves away from what is functionally a “for loop” to allow better parallel processing:
Example code includes:
foldable_range <- function(x) {
  # Return c(min, max) of x. x is either a plain vector, or a list of
  # partial results (each a vector or min/max pair) to be folded together.
  if (!is.list(x)) {
    # Plain vector: range() gives its min and max directly
    return(range(x))
  }
  # List input: fold the per-element minima and maxima into overall bounds
  c(Reduce(min, x), Reduce(max, x))
}
# Verify that foldable_range() works on the record_number column
foldable_range(mort[, "record_number"])
## [1] 0 999
# Split the mortgage data by year
# (seq_len() is safe when nrow() is 0, unlike 1:nrow())
spl <- split(seq_len(nrow(mort)), mort[, "year"])
# Use foldable_range() to get the range of the record numbers
foldable_range(Map(function(s) foldable_range(mort[s, "record_number"]), spl))
## [1] 0 999
# Load the iotools and microbenchmark packages
library(iotools)
library(microbenchmark)
# Time the reading of files
microbenchmark(
# Time the reading of a file using read.delim five times
read.delim("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ","),
# Time the reading of a file using read.delim.raw five times
read.delim.raw("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ","),
times = 5
)
## Unit: milliseconds
## expr
## read.delim("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ",")
## read.delim.raw("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ",")
## min lq mean median uq max neval cld
## 308.05821 308.90653 358.39916 364.51906 383.3692 427.1428 5 b
## 69.16828 73.25159 84.04953 76.82922 96.4140 104.5846 5 a
# Read mortgage-sample.csv as a raw vector
raw_file_content <- readAsRaw("./RInputFiles/mortgage-sample.csv")
# Convert the raw vector contents to a matrix
mort_mat <- mstrsplit(raw_file_content, sep = ",", type = "integer", skip = 1)
# Look at the first 6 rows
head(mort_mat)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 1 566 1 1 3 1 2 4 3 9 2 4 1
## [2,] 1 116 1 3 2 1 2 4 5 9 1 4 1
## [3,] 1 239 1 2 2 3 8 4 5 5 1 2 1
## [4,] 1 62 1 2 3 3 2 4 5 9 2 4 1
## [5,] 1 106 1 2 3 3 2 4 9 9 3 4 1
## [6,] 1 759 1 3 3 2 2 4 9 9 1 2 2
## [,14] [,15] [,16]
## [1,] 3 2010 1
## [2,] 3 2008 1
## [3,] 4 2014 0
## [4,] 4 2009 1
## [5,] 4 2013 1
## [6,] 4 2010 1
# Convert the raw file contents to a data.frame
mort_df <- dstrsplit(raw_file_content, sep = ",", col_types = rep("integer", 16), skip = 1)
# Look at the first 6 rows
head(mort_df)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1 1 566 1 1 3 1 2 4 3 9 2 4 1 3 2010 1
## 2 1 116 1 3 2 1 2 4 5 9 1 4 1 3 2008 1
## 3 1 239 1 2 2 3 8 4 5 5 1 2 1 4 2014 0
## 4 1 62 1 2 3 3 2 4 5 9 2 4 1 4 2009 1
## 5 1 106 1 2 3 3 2 4 9 9 3 4 1 4 2013 1
## 6 1 759 1 3 3 2 2 4 9 9 1 2 2 4 2010 1
# We have created a file connection fc to the "mortgage-sample.csv" file and read in the first line to get rid of the header.
# Define the function to apply to each chunk
make_table <- function(chunk) {
# Read each chunk as a matrix
x <- mstrsplit(chunk, type = "integer", sep = ",")
# Create a table of the number of borrowers (column 3) for each chunk
table(x[, 3])
}
# Create a file connection to mortgage-sample.csv
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
# Read the first line to get rid of the header
(col_names <- readLines(fc, n = 1))
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
(col_names <- lapply(str_split(col_names, '\\",\\"'), FUN=function(x) { str_replace(x, '\\"', '') })[[1]])
## [1] "enterprise" "record_number"
## [3] "msa" "perc_minority"
## [5] "tract_income_ratio" "borrower_income_ratio"
## [7] "loan_purpose" "federal_guarantee"
## [9] "borrower_race" "co_borrower_race"
## [11] "borrower_gender" "co_borrower_gender"
## [13] "num_units" "affordability"
## [15] "year" "type"
# Read the data in chunks
counts <- chunk.apply(fc, make_table, CH.MAX.SIZE = 1e5)
# Close the file connection
close(fc)
# Print counts
counts
## 0 1
## [1,] 309 2401
## [2,] 289 2422
## [3,] 266 2444
## [4,] 300 2410
## [5,] 279 2431
## [6,] 310 2400
## [7,] 274 2436
## [8,] 283 2428
## [9,] 259 2452
## [10,] 287 2423
## [11,] 288 2423
## [12,] 283 2428
## [13,] 271 2439
## [14,] 299 2411
## [15,] 294 2416
## [16,] 305 2405
## [17,] 280 2431
## [18,] 275 2435
## [19,] 303 2407
## [20,] 279 2431
## [21,] 296 2414
## [22,] 294 2417
## [23,] 288 2424
## [24,] 264 2446
## [25,] 292 2418
## [26,] 228 2013
# Sum up the chunks
colSums(counts)
## 0 1
## 7395 62605
msa_map <- c("rural", "urban")
# Define the function to apply to each chunk
make_msa_table <- function(chunk) {
# Read each chunk as a data frame
x <- dstrsplit(chunk, col_types = rep("integer", length(col_names)), sep = ",")
# Set the column names of the data frame that's been read
colnames(x) <- col_names
# Create new column, msa_pretty, with a string description of where the borrower lives
x$msa_pretty <- msa_map[x$msa + 1]
# Create a table from the msa_pretty column
table(x$msa_pretty)
}
# Create a file connection to mortgage-sample.csv
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
# Read the first line to get rid of the header
readLines(fc, n = 1)
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Read the data in chunks
counts <- chunk.apply(fc, make_msa_table, CH.MAX.SIZE = 1e5)
# Close the file connection
close(fc)
# Aggregate the counts as before
colSums(counts)
## rural urban
## 7395 62605
# Read mortgage-sample.csv chunk-wise with a given degree of parallelism.
# Returns the chunk.apply() result. The original returned close(fc)'s
# value (discarding the table) and leaked the connection on error;
# on.exit() now guarantees the connection is closed.
iotools_read_fun <- function(parallel) {
  fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
  on.exit(close(fc), add = TRUE)
  # Discard the header line before chunked parsing
  readLines(fc, n = 1)
  chunk.apply(fc, make_msa_table,
              CH.MAX.SIZE = 1e5, parallel = parallel)
}
# Benchmark the new function
microbenchmark(
# Use one process
iotools_read_fun(1),
# Use three processes
iotools_read_fun(3),
times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max
## iotools_read_fun(1) 108.2739 117.4746 140.8637 127.5814 147.8720 262.3306
## iotools_read_fun(3) 106.7766 117.0840 125.0748 123.8654 131.4695 148.9044
## neval cld
## 20 a
## 20 a
Chapter 4 - Case Study: Preliminary Analysis of Housing Data
Overview of types of analysis for this chapter:
Are the data missing at random?
Analyzing the Housing Data:
Borrower Lending Trends: City vs. Rural:
Wrap up:
Example code includes:
# Create a table of borrower_race column
race_table <- bigtable(mort, "borrower_race")
# Rename the elements
names(race_table) <- race_cat[as.numeric(names(race_table))]
# Find the proportion
race_table[1:7] / sum(race_table[1:7])
## Native Am Asian Black Pacific Is White Two or More
## 0.002330129 0.072315464 0.032915105 0.003177448 0.814828092 0.008603552
## Hispanic
## 0.065830210
mort_names <- col_names
# Create table of the borrower_race
race_table_chunks <- chunk.apply(
"./RInputFiles/mortgage-sample.csv", function(chunk) {
x <- mstrsplit(chunk, sep = ",", type = "integer")
colnames(x) <- mort_names
table(x[, "borrower_race"])
}, CH.MAX.SIZE = 1e5)
# Add up the columns
race_table <- colSums(race_table_chunks)
# Find the proportion
borrower_proportion <- race_table[1:7] / sum(race_table[1:7])
pop_proportion <- c(0.009, 0.048, 0.126, 0.002, 0.724, 0.029, 0.163)
names(pop_proportion) <- race_cat[1:7]
# Create the matrix
matrix(c(pop_proportion, borrower_proportion), byrow = TRUE, nrow = 2,
dimnames = list(c("Population Proportion", "Borrower Proportion"), race_cat[1:7]))
## Native Am Asian Black Pacific Is
## Population Proportion 0.009000000 0.04800000 0.12600000 0.002000000
## Borrower Proportion 0.002330129 0.07231546 0.03291511 0.003177448
## White Two or More Hispanic
## Population Proportion 0.7240000 0.029000000 0.16300000
## Borrower Proportion 0.8148281 0.008603552 0.06583021
# Create a variable indicating if borrower_race is missing in the mortgage data
borrower_race_ind <- mort[, "borrower_race"] == 9
# Create a factor variable indicating the affordability
affordability_factor <- factor(mort[, "affordability"])
# Perform a logistic regression
summary(glm(borrower_race_ind ~ affordability_factor, family = binomial))
##
## Call:
## glm(formula = borrower_race_ind ~ affordability_factor, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5969 -0.5016 -0.5016 -0.5016 2.0867
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7478 0.1376 -12.701 <2e-16 ***
## affordability_factor1 -0.2241 0.1536 -1.459 0.1447
## affordability_factor2 -0.3090 0.1609 -1.920 0.0548 .
## affordability_factor3 -0.2094 0.1446 -1.448 0.1476
## affordability_factor4 -0.2619 0.1383 -1.894 0.0582 .
## affordability_factor9 0.1131 0.1413 0.800 0.4235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 52279 on 69999 degrees of freedom
## Residual deviance: 52166 on 69994 degrees of freedom
## AIC: 52178
##
## Number of Fisher Scoring iterations: 4
# Open a connection to the file and skip the header
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
readLines(fc, n = 1)
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Create a function to read chunks
make_table <- function(chunk) {
# Create a matrix
m <- mstrsplit(chunk, sep = ",", type = "integer")
colnames(m) <- mort_names
# Create the output table
bigtable(m, c("borrower_race", "year"))
}
# Import data using chunk.apply
race_year_table <- chunk.apply(fc, make_table)
# Close connection
close(fc)
# Cast it to a data frame
rydf <- as.data.frame(race_year_table)
# Create a new column Race with race/ethnicity
rydf$Race <- race_cat
# Note: We removed the row corresponding to "Not Avail".
# View rydf
rydf <-
rydf %>%
filter(Race !="Not Avail")
rydf
## 2008 2009 2010 2011 2012 2013 2014 2015 Race
## 1 11 18 13 16 15 12 29 29 Native Am
## 2 384 583 603 568 770 673 369 488 Asian
## 3 363 320 209 204 258 312 185 169 Black
## 4 33 38 21 13 28 22 17 23 Pacific Is
## 5 5552 7739 6301 5746 8192 7535 4110 4831 White
## 6 43 85 65 58 89 78 46 64 Two or More
## 7 577 563 384 378 574 613 439 512 Hispanic
# View pop_proportion
pop_proportion
## Native Am Asian Black Pacific Is White Two or More
## 0.009 0.048 0.126 0.002 0.724 0.029
## Hispanic
## 0.163
# Gather on all variables except Race
rydfl <- gather(rydf, Year, Count, -Race)
# Create a new adjusted count variable
rydfl$Adjusted_Count <- rydfl$Count / pop_proportion[rydfl$Race]
# Plot
ggplot(rydfl, aes(x = Year, y = Adjusted_Count, group = Race, color = Race)) +
geom_line()
# View rydf
rydf
## 2008 2009 2010 2011 2012 2013 2014 2015 Race
## 1 11 18 13 16 15 12 29 29 Native Am
## 2 384 583 603 568 770 673 369 488 Asian
## 3 363 320 209 204 258 312 185 169 Black
## 4 33 38 21 13 28 22 17 23 Pacific Is
## 5 5552 7739 6301 5746 8192 7535 4110 4831 White
## 6 43 85 65 58 89 78 46 64 Two or More
## 7 577 563 384 378 574 613 439 512 Hispanic
# Normalize each row by its first (2008) count, giving trends relative to 2008
for (i in seq_len(nrow(rydf))) {
rydf[i, 1:8] <- rydf[i, 1:8] / rydf[i, 1]
}
# Convert the data to long format
rydf_long <- gather(rydf, Year, Proportion, -Race)
# Plot
ggplot(rydf_long, aes(x = Year, y = Proportion, group = Race, color = Race)) +
geom_line()
# Open a connection to the file and skip the header
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
readLines(fc, n = 1)
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Create a function to read chunks
make_table <- function(chunk) {
# Create a matrix
m <- mstrsplit(chunk, sep = ",", type = "integer")
colnames(m) <- mort_names
# Create the output table
bigtable(m, c("msa", "year"))
}
# Import data using chunk.apply
msa_year_table <- chunk.apply(fc, make_table)
# Close connection
close(fc)
# Convert to a data frame
df_msa <- as.data.frame(msa_year_table)
# Add an MSA label column (the code adds a column; it does not rename any)
df_msa$MSA <- c("rural", "city")
# Gather on all columns except Year
df_msa_long <- gather(df_msa, Year, Count, -MSA)
# Plot
ggplot(df_msa_long, aes(x = Year, y = Count, group = MSA, color = MSA)) +
geom_line()
# Tabulate borrower_income_ratio and federal_guarantee
ir_by_fg <- bigtable(mort, c("borrower_income_ratio", "federal_guarantee"))
# Label the columns and rows of the table
income_cat <- c('0 <= 50', '50 < 80', '> 80', 'Not Applicable')
guarantee_cat <- c('FHA/VA', 'RHS', 'HECM', 'No Guarantee')
dimnames(ir_by_fg) <- list(income_cat, guarantee_cat)
# For each row in ir_by_fg, divide by the sum of the row
for (i in seq_len(nrow(ir_by_fg))) {
ir_by_fg[i, ] = ir_by_fg[i, ] / sum(ir_by_fg[i, ])
}
# Print
ir_by_fg
## FHA/VA RHS HECM No Guarantee
## 0 <= 50 0.008944544 0.0014636526 0.0443974630 0.9451943
## 50 < 80 0.005977548 0.0024055985 0.0026971862 0.9889197
## > 80 0.001113022 0.0002428412 0.0006475766 0.9979966
## Not Applicable 0.023676880 0.0013927577 0.0487465181 0.9261838
# Quirky fix so that the files can be used again later
rm(mort)
rm(x)
rm(first_three)
rm(first_three_2)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1882288 100.6 3205452 171.2 3205452 171.2
## Vcells 12859420 98.2 20680586 157.8 17908220 136.7
Chapter 1 - Downloading Files and Using API Clients
Introduction: Working with Web Data in R:
Understanding Application Programming Interfaces (API) - automatically handling data changes:
Access tokens and API:
Example code includes:
# Here are the URLs! As you can see they're just normal strings
csv_url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1561/datasets/chickwts.csv"
tsv_url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_3026/datasets/tsv_data.tsv"
# Read a file in from the CSV URL and assign it to csv_data
csv_data <- read.csv(csv_url)
# Read a file in from the TSV URL and assign it to tsv_data
tsv_data <- read.delim(tsv_url)
# Examine the objects with head()
head(csv_data)
## weight feed
## 1 179 horsebean
## 2 160 horsebean
## 3 136 horsebean
## 4 227 horsebean
## 5 217 horsebean
## 6 168 horsebean
head(tsv_data)
## weight feed
## 1 179 horsebean
## 2 160 horsebean
## 3 136 horsebean
## 4 227 horsebean
## 5 217 horsebean
## 6 168 horsebean
# Download the file with download.file()
download.file(url = csv_url, destfile = "./RInputFiles/feed_data.csv")
# Read it in with read.csv()
csv_data <- read.csv("./RInputFiles/feed_data.csv")
# Add a new column: square_weight
csv_data$square_weight <- csv_data$weight ** 2
# Save it to disk with saveRDS()
saveRDS(csv_data, "./RInputFiles/modified_feed_data.RDS")
# Read it back in with readRDS()
modified_feed_data <- readRDS("./RInputFiles/modified_feed_data.RDS")
# Examine modified_feed_data
str(modified_feed_data)
## 'data.frame': 71 obs. of 3 variables:
## $ weight : int 179 160 136 227 217 168 108 124 143 140 ...
## $ feed : Factor w/ 6 levels "casein","horsebean",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ square_weight: num 32041 25600 18496 51529 47089 ...
# Load pageviews
# library(pageviews)
# Get the pageviews for "Hadley Wickham"
hadley_pageviews <- pageviews::article_pageviews(project = "en.wikipedia", "Hadley Wickham")
# Examine the resulting object
str(hadley_pageviews)
## 'data.frame': 1 obs. of 8 variables:
## $ project : chr "wikipedia"
## $ language : chr "en"
## $ article : chr "Hadley_Wickham"
## $ access : chr "all-access"
## $ agent : chr "all-agents"
## $ granularity: chr "daily"
## $ date : POSIXct, format: "2015-10-01"
## $ views : num 53
# Load birdnik
# library(birdnik)
# Get the word frequency for "vector", using api_key to access it
# vector_frequency <- word_frequency(api_key, "vector")
Chapter 2 - Using httr to interact with API Directly
GET and POST requests in theory - https and web requests in theory:
Graceful httr - code that responds appropriately and constructs its own url:
Respectful API Usage - usage that works for the API owners as well as the clients:
Example code includes:
# Load the httr package
library(httr)
# Make a GET request to http://httpbin.org/get
get_result <- GET("http://httpbin.org/get")
# Print it to inspect it
# get_result
# Make a POST request to http://httpbin.org/post with the body "this is a test"
# post_result <- POST(url="http://httpbin.org/post", body="this is a test")
# Print it to inspect it
# post_result
url <- "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/Hadley_Wickham/daily/20170101/20170102"
# Make a GET request to url and save the results
pageview_response <- GET(url)
# Call content() to retrieve the data the server sent back
pageview_data <- content(pageview_response)
# Examine the results with str()
str(pageview_data)
## List of 1
## $ items:List of 2
## ..$ :List of 7
## .. ..$ project : chr "en.wikipedia"
## .. ..$ article : chr "Hadley_Wickham"
## .. ..$ granularity: chr "daily"
## .. ..$ timestamp : chr "2017010100"
## .. ..$ access : chr "all-access"
## .. ..$ agent : chr "all-agents"
## .. ..$ views : int 45
## ..$ :List of 7
## .. ..$ project : chr "en.wikipedia"
## .. ..$ article : chr "Hadley_Wickham"
## .. ..$ granularity: chr "daily"
## .. ..$ timestamp : chr "2017010200"
## .. ..$ access : chr "all-access"
## .. ..$ agent : chr "all-agents"
## .. ..$ views : int 86
fake_url <- "http://google.com/fakepagethatdoesnotexist"
# Make the GET request
request_result <- GET(fake_url)
# Check request_result
if(http_error(request_result)){
warning("The request failed")
} else {
content(request_result)
}
## Warning: The request failed
# Construct a directory-based API URL to `http://swapi.co/api`,
# looking for person `1` in `people`
directory_url <- paste("http://swapi.co/api", "people", 1, sep = "/")
# Make a GET call with it
result <- GET(directory_url)
# Create list with nationality and country elements
query_params <- list(nationality = "americans",
country = "antigua")
# Make parameter-based call to httpbin, with query_params
parameter_response <- GET("https://httpbin.org/get", query = query_params)
# Print parameter_response
parameter_response
## Response [https://httpbin.org/get?nationality=americans&country=antigua]
## Date: 2018-02-16 13:02
## Status: 200
## Content-Type: application/json
## Size: 425 B
## {
## "args": {
## "country": "antigua",
## "nationality": "americans"
## },
## "headers": {
## "Accept": "application/json, text/xml, application/xml, */*",
## "Accept-Encoding": "gzip, deflate",
## "Connection": "close",
## "Host": "httpbin.org",
## ...
# Do not change the url
# url <- "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/Aaron_Halfaker/daily/2015100100/2015103100"
# Add the email address and the test sentence inside user_agent()
# server_response <- GET(url, user_agent("my@email.address this is a test"))
# Construct a vector of 2 URLs to request in sequence
urls <- c("http://fakeurl.com/api/1.0/", "http://fakeurl.com/api/2.0/")
for(url in urls){
# Send a GET request to url
result <- GET(url)
# Pause between requests so we don't hammer the API server.
# NOTE(review): the original comment said "5 seconds" but the code
# sleeps for 1 second -- confirm which delay was intended.
Sys.sleep(1)
}
# Fetch daily pageview counts (1-31 Oct 2015) for a Wikipedia article from
# the Wikimedia REST API.
#
# article_title: the article to look up, e.g. "Hadley_Wickham".
# Returns the parsed response body (a list) from content(); stops with an
# error if the request fails.
get_pageviews <- function(article_title){
  # Join the URL path pieces with "/" separators.
  # (The original used paste0(..., sep = "/"): paste0() has no `sep`
  # argument, so "/" was silently treated as one more string to
  # concatenate and the path segments ran together, e.g.
  # "...all-agentsHadley_Wickhamdaily/..." -- a malformed URL.)
  url <- paste(
    "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents",
    article_title,
    "daily/2015100100/2015103100",
    sep = "/"
  )
  # Identify ourselves to the API owners, as respectful usage requires.
  response <- GET(url, user_agent("my@email.com this is a test"))
  if (http_error(response)) {
    stop("the request failed")
  }
  # Parse and return the response body.
  content(response)
}
Chapter 3 - Handling JSON and XML
JSON is a dictionary-like format (plain text) for sending data on the internet:
Manipulating JSON - lists are the natural R hierarchy for JSON:
XML Structure - plain text like JSON, but with a very different structure:
XPATH - language for specifying nodes in an XML document:
Example code includes:
# Stand-in for a live call to the Wikipedia revision-history API: loads a
# previously saved response object from disk instead of hitting the network.
#
# title:  must be exactly "Hadley Wickham" (the only response that was saved).
# format: "json" or "xml", selecting which saved response file to load.
# Returns the saved response object; stops for any other title or format.
rev_history <- function(title, format = "json"){
  if (title != "Hadley Wickham") {
    stop('rev_history() only works for `title = "Hadley Wickham"`')
  }
  # Dispatch on the requested format; the unnamed final argument is the
  # default branch, reached for any unrecognised format string.
  switch(format,
         json = readRDS("had_rev_json.rds"),
         xml  = readRDS("had_rev_xml.rds"),
         stop('Invalid format supplied, try "json" or "xml"'))
}
test_json <- "{\"continue\":{\"rvcontinue\":\"20150528042700|664370232\",\"continue\":\"||\"},\"query\":{\"pages\":{\"41916270\":{\"pageid\":41916270,\"ns\":0,\"title\":\"Hadley Wickham\",\"revisions\":[{\"user\":\"214.28.226.251\",\"anon\":\"\",\"timestamp\":\"2015-01-14T17:12:45Z\",\"comment\":\"\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Mary Helen Wickham III''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"73.183.151.193\",\"anon\":\"\",\"timestamp\":\"2015-01-15T15:49:34Z\",\"comment\":\"\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"FeanorStar7\",\"timestamp\":\"2015-01-24T16:34:31Z\",\"comment\":\"/* External links */ add LCCN and cats\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"KasparBot\",\"timestamp\":\"2015-04-26T19:18:17Z\",\"comment\":\"authority control moved to wikidata\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"Spkal\",\"timestamp\":\"2015-05-06T18:24:57Z\",\"comment\":\"/* Bibliography */ Added his new book, R Packages\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! |publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"}]}}}}"
# Get revision history for "Hadley Wickham"
resp_json <- rev_history("Hadley Wickham")
# Check http_type() of resp_json
http_type(resp_json)
# Examine returned text with content()
content(resp_json, as="text")
# Parse response with content()
content(resp_json, as="parsed")
# Parse returned text with fromJSON()
library(jsonlite)
fromJSON(content(resp_json, as="text"))
# Load rlist
library(rlist)
# Examine output of this code
str(content(resp_json), max.level = 4)
# Store revision list
revs <- content(resp_json)$query$pages$`41916270`$revisions
# Extract the user element
user_time <- list.select(revs, user, timestamp)
# Print user_time
user_time
# Stack to turn into a data frame
list.stack(user_time)
# Load dplyr
library(dplyr)
# Pull out revision list
revs <- content(resp_json)$query$pages$`41916270`$revisions
# Extract user and timestamp
revs %>%
bind_rows() %>%
select(user, timestamp)
# Load xml2
library(xml2)
# Get XML revision history
resp_xml <- rev_history("Hadley Wickham", format = "xml")
# Check response is XML
http_type(resp_xml)
# Examine returned text with content()
rev_text <- content(resp_xml, as="text")
rev_text
# Turn rev_text into an XML document
rev_xml <- read_xml(rev_text)
# Examine the structure of rev_xml
str(rev_xml)
# Load xml2
library(xml2)
# Get XML revision history
resp_xml <- rev_history("Hadley Wickham", format = "xml")
# Check response is XML
http_type(resp_xml)
# Examine returned text with content()
rev_text <- content(resp_xml, as="text")
rev_text
# Turn rev_text into an XML document
rev_xml <- read_xml(rev_text)
# Examine the structure of rev_xml
xml_structure(rev_xml)
# Find all nodes using XPATH "/api/query/pages/page/revisions/rev"
xml_find_all(rev_xml, "/api/query/pages/page/revisions/rev")
# Find all rev nodes anywhere in document
rev_nodes <- xml_find_all(rev_xml, "//rev")
# Use xml_text() to get text from rev_nodes
xml_text(rev_nodes)
# All rev nodes
rev_nodes <- xml_find_all(rev_xml, "//rev")
# The first rev node
first_rev_node <- xml_find_first(rev_xml, "//rev")
# Find all attributes with xml_attrs()
xml_attrs(first_rev_node)
# Find user attribute with xml_attr()
xml_attr(first_rev_node, attr="user")
# Find user attribute for all rev nodes
xml_attr(rev_nodes, attr="user")
# Find anon attribute for all rev nodes
xml_attr(rev_nodes, attr="anon")
# Retrieve the revision history of a Wikipedia article (via the local
# rev_history() stub) and flatten it into a data frame, one row per revision.
#
# article_title: article to fetch; passed straight to rev_history(), which in
#                this document only supports "Hadley Wickham".
# Returns a data.frame with columns:
#   user      - username of the editor (character)
#   timestamp - revision time parsed to a datetime via readr::parse_datetime()
#   content   - first 40 characters of the revision text
get_revision_history <- function(article_title){
  # Get raw revision response (XML format)
  rev_resp <- rev_history(article_title, format = "xml")
  # Turn the content() of rev_resp into an XML document
  rev_xml <- read_xml(content(rev_resp, "text"))
  # Each <rev> node anywhere in the document is one revision
  rev_nodes <- xml_find_all(rev_xml, "//rev")
  # Usernames and timestamps are stored as attributes on the <rev> nodes
  user <- xml_attr(rev_nodes, attr = "user")
  timestamp <- readr::parse_datetime(xml_attr(rev_nodes, "timestamp"))
  # The revision text itself is the node's text content
  content <- xml_text(rev_nodes)
  # stringsAsFactors = FALSE keeps user/content as character vectors on
  # R < 4.0, where data.frame() would otherwise convert them to factors;
  # on R >= 4.0 this is already the default, so behavior is unchanged there.
  data.frame(user = user,
             timestamp = timestamp,
             content = substr(content, 1, 40),
             stringsAsFactors = FALSE)
}
# Call function for "Hadley Wickham"
get_revision_history(article_title = "Hadley Wickham")
Chapter 4 - Web Scraping with XPATH
Web scraping 101 - sometimes a website does not have an API, so a different approach is required:
HTML structure - basically, content within tags, much like XML:
This is a test
requests that “This is a test” be available in paragraph form. Reformatting data (especially to a rectangular format such as a data frame):
Example code includes:
# Load rvest
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Hadley Wickham's Wikipedia page
test_url <- "https://en.wikipedia.org/wiki/Hadley_Wickham"
# Read the URL stored as "test_url" with read_html()
test_xml <- read_html(test_url)
# Print test_xml
test_xml
## {xml_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-sub ...
test_node_xpath <- "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"vcard\", \" \" ))]"
# Use html_node() to grab the node with the XPATH stored as `test_node_xpath`
node <- html_node(x = test_xml, xpath = test_node_xpath)
# Print the first element of the result
node[1]
## $node
## <pointer: 0x000000000b91bb80>
# The first thing we'll grab is a name, from the first element of the previously extracted table (now stored as table_element)
table_element <- node
# Extract the name of table_element
element_name <- html_name(table_element)
# Print the name
element_name
## [1] "table"
second_xpath_val <- "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"fn\", \" \" ))]"
# Extract the element of table_element referred to by second_xpath_val and store it as page_name
page_name <- html_node(x = table_element, xpath = second_xpath_val)
# Extract the text from page_name
page_title <- html_text(page_name)
# Print page_title
page_title
## [1] "Hadley Wickham"
# Turn table_element into a data frame and assign it to wiki_table
wiki_table <- html_table(table_element)
# Print wiki_table
wiki_table
## Hadley Wickham
## 1
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## 11
## Hadley Wickham
## 1
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
## 11
# Rename the columns of wiki_table
colnames(wiki_table) <- c("key", "value")
# Remove the empty row from wiki_table
cleaned_table <- subset(wiki_table, !(key == ""))
# Print cleaned_table
cleaned_table
## key
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## value
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
Chapter 5 - CSS Web Scraping and Final Case Study
CSS (cascading style sheets) web scraping in theory:
Final case study: Introduction:
Wrap up:
Example code includes:
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Hadley Wickham's Wikipedia page
test_url <- "https://en.wikipedia.org/wiki/Hadley_Wickham"
# Read the URL stored as "test_url" with read_html()
test_xml <- read_html(test_url)
# Print test_xml
test_xml
## {xml_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-sub ...
# Select the table elements
html_nodes(test_xml, css = "table")
## {xml_nodeset (2)}
## [1] <table class="infobox biography vcard" style="width:22em">\n<tr>\n<t ...
## [2] <table class="nowraplinks hlist navbox-inner" style="border-spacing: ...
# Select elements with class = "infobox"
html_nodes(test_xml, css = ".infobox")
## {xml_nodeset (1)}
## [1] <table class="infobox biography vcard" style="width:22em">\n<tr>\n<t ...
# Select elements with id = "firstHeading"
html_nodes(test_xml, css = "#firstHeading")
## {xml_nodeset (1)}
## [1] <h1 id="firstHeading" class="firstHeading" lang="en">Hadley Wickham< ...
# Extract element with class infobox
infobox_element <- html_nodes(test_xml, css = ".infobox")
# Get tag name of infobox_element
element_name <- html_name(infobox_element)
# Print element_name
element_name
## [1] "table"
# Extract element with class fn
page_name <- html_node(x = infobox_element, css=".fn")
# Get contents of page_name
page_title <- html_text(page_name)
# Print page_title
page_title
## [1] "Hadley Wickham"
# Load httr
library(httr)
# The API url
base_url <- "https://en.wikipedia.org/w/api.php"
# Set query parameters
query_params <- list(action="parse",
page="Hadley Wickham",
format="xml")
# Get data from API
resp <- GET(url = "https://en.wikipedia.org/w/api.php", query = query_params)
# Parse response
resp_xml <- content(resp)
# Load rvest
library(rvest)
# Read page contents as HTML
page_html <- read_html(xml_text(resp_xml))
# Extract infobox element
infobox_element <- html_node(page_html, css=".infobox")
# Extract page name element from infobox
page_name <- html_node(infobox_element, css=".fn")
# Extract page name as text
page_title <- html_text(page_name)
# Your code from earlier exercises
wiki_table <- html_table(infobox_element)
colnames(wiki_table) <- c("key", "value")
cleaned_table <- subset(wiki_table, !key == "")
# Create a dataframe for full name
name_df <- data.frame(key = "Full name", value = page_title)
# Combine name_df with cleaned_table
wiki_table2 <- rbind(name_df, cleaned_table)
# Print wiki_table
wiki_table2
## key
## 1 Full name
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## value
## 1 Hadley Wickham
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
library(httr)
library(rvest)
library(xml2)
# Scrape the infobox of a Wikipedia article into a two-column (key/value)
# data frame, with the page's full name prepended as the first row.
#
# title: the Wikipedia page to fetch, e.g. "Hadley Wickham".
# Returns a data.frame with columns `key` and `value`; stops with an
# informative error when the request fails or the page has no infobox
# (previously those cases fell through to an opaque downstream error).
get_infobox <- function(title){
  base_url <- "https://en.wikipedia.org/w/api.php"
  # Ask the MediaWiki API for the parsed page, wrapped in XML
  query_params <- list(action = "parse",
                       page = title,
                       format = "xml")
  resp <- GET(url = base_url, query = query_params)
  if (http_error(resp)) {
    stop("Request for page '", title, "' failed", call. = FALSE)
  }
  resp_xml <- content(resp)
  # The page HTML is embedded as text inside the XML response
  page_html <- read_html(xml_text(resp_xml))
  infobox_element <- html_node(x = page_html, css = ".infobox")
  # html_node() returns an xml_missing object when nothing matches
  if (inherits(infobox_element, "xml_missing")) {
    stop("No infobox found on page '", title, "'", call. = FALSE)
  }
  # The element with class "fn" holds the subject's full name
  page_name <- html_node(x = infobox_element, css = ".fn")
  page_title <- html_text(page_name)
  # Flatten the infobox table and drop its empty separator rows
  wiki_table <- html_table(infobox_element)
  colnames(wiki_table) <- c("key", "value")
  cleaned_table <- subset(wiki_table, key != "")
  # Prepend the full name as the first key/value pair
  name_df <- data.frame(key = "Full name", value = page_title)
  rbind(name_df, cleaned_table)
}
# Test get_infobox with "Hadley Wickham"
get_infobox(title = "Hadley Wickham")
## key
## 1 Full name
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## value
## 1 Hadley Wickham
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
# Try get_infobox with "Ross Ihaka"
get_infobox(title = "Ross Ihaka")
## key
## 1 Full name
## 2 Ihaka at the 2010 New Zealand Open Source Awards
## 3 Residence
## 4 Alma mater
## 5 Known for
## 6 Awards
## 7 Scientific career
## 8 Fields
## 9 Institutions
## 10 Thesis
## 11 Doctoral advisor
## value
## 1 Ross Ihaka
## 2 Ihaka at the 2010 New Zealand Open Source Awards
## 3 Auckland, New Zealand
## 4 University of AucklandUniversity of California, Berkeley
## 5 R programming language
## 6 Pickering Medal (2008)
## 7 Scientific career
## 8 Statistical Computing
## 9 University of Auckland
## 10 Ruaumoko (1985)
## 11 David R. Brillinger
# Try get_infobox with "Grace Hopper"
get_infobox(title = "Grace Hopper")
## key
## 1 Full name
## 2 Rear Admiral Grace M. Hopper, 1984
## 3 Nickname(s)
## 4 Born
## 5 Died
## 6 Place of burial
## 7 Allegiance
## 8 Service/branch
## 9 Years of service
## 10 Rank
## 11 Awards
## value
## 1 Grace Murray Hopper
## 2 Rear Admiral Grace M. Hopper, 1984
## 3 "Amazing Grace"
## 4 (1906-12-09)December 9, 1906\nNew York City, New York, U.S.
## 5 January 1, 1992(1992-01-01) (aged 85)Arlington, Virginia, U.S.
## 6 Arlington National Cemetery
## 7 United States of America
## 8 United States Navy
## 9 19431966, 19671971, 19721986
## 10 Rear admiral (lower half)
## 11 Defense Distinguished Service Medal Legion of Merit Meritorious Service Medal American Campaign Medal World War II Victory Medal National Defense Service Medal Armed Forces Reserve Medal with two Hourglass Devices Naval Reserve Medal Presidential Medal of Freedom (posthumous)
Chapter 1 - Basic plotting with lattice
Introduction - general objectives:
Optional arguments:
Box and whisker plots and reordering elements:
Example code includes:
data(airquality)
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Load the lattice package
library(lattice)
# Create the histogram
histogram(~ Ozone, data = airquality)
# Create the histogram
histogram(~ Ozone, data = airquality,
# Specify number of bins
nint = 15,
# Specify quantity displayed on y-axis
type = "count")
# Create the scatter plot
xyplot(Ozone ~ Solar.R, data = airquality)
# Create scatterplot
xyplot(Ozone ~ Temp, data = airquality,
# Add main label
main = "Environmental conditions in New York City (1973)",
# Add axis labels
ylab = "Ozone (ppb)",
xlab = "Temperature (Fahrenheit)")
# Create a density plot
densityplot(~ Ozone, data = airquality,
# Choose how raw data is shown
plot.points = "jitter")
data(USCancerRates, package="latticeExtra")
str(USCancerRates)
## 'data.frame': 3041 obs. of 8 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female: num 124 103 161 157 151 ...
## $ UCL95.female: num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
rn_USCR <- row.names(USCancerRates)
# Create reordered variable
library(dplyr)
USCancerRates <-
mutate(USCancerRates,
state.ordered = reorder(state, rate.female, median, na.rm = TRUE)
)
# Create box and whisker plot
bwplot(state.ordered ~ rate.female, data = USCancerRates)
# Create box and whisker plot
bwplot(state.ordered ~ rate.female, data = USCancerRates,
# Change whiskers extent
coef = 0)
Chapter 2 - Conditioning and the Formula Interface
Conditioning - identify sources of variability in the data by examining sub-groups:
Data summary and transformation - grouping:
Incorporating external data sources:
The trellis object - lattice creates trellis objects rather than directly creating plots (as in base R):
Example code includes:
# The airquality dataset has been pre-loaded
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Create a histogram
histogram(~ Ozone | factor(Month),
data = airquality,
# Define the layout
layout=c(2, 3),
# Change the x-axis label
xlab="Ozone (ppb)")
# USCancerRates has been pre-loaded
str(USCancerRates)
## 'data.frame': 3041 obs. of 9 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female : num 124 103 161 157 151 ...
## $ UCL95.female : num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
## $ state.ordered: Factor w/ 49 levels "Utah","New Mexico",..: 25 25 25 25 25 25 25 25 25 25 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
# Create a density plot
densityplot(~ rate.male + rate.female,
data = USCancerRates,
outer = TRUE,
# Suppress data points
plot.points = FALSE,
# Add a reference line
ref=TRUE)
# Create a density plot
densityplot(~ rate.male + rate.female,
data = USCancerRates,
# Set value of 'outer'
outer=FALSE,
# Add x-axis label
xlab="Rate (per 100,000)",
# Add a legend
auto.key=TRUE,
plot.points = FALSE,
ref = TRUE)
xyplot(Ozone ~ Temp, airquality, groups = Month,
# Complete the legend spec
auto.key = list(space = "right",
title = "Month",
text = month.name[5:9]))
USCancerRates <- USCancerRates %>%
mutate(division=state.division[match(state, state.name)])
# Create 'division.ordered' by reordering levels
USCancerRates <-
mutate(USCancerRates,
division.ordered = reorder(division,
rate.male + rate.female,
mean, na.rm = TRUE))
# Create conditioned scatter plot
xyplot(rate.female ~ rate.male | division.ordered,
data = USCancerRates,
# Add reference grid
grid = TRUE,
# Add reference line
abline = c(0, 1))
# Levels of division.ordered
levels(USCancerRates$division.ordered)
## [1] "Mountain" "West North Central" "Pacific"
## [4] "Middle Atlantic" "New England" "East North Central"
## [7] "West South Central" "South Atlantic" "East South Central"
# Specify the as.table argument
xyplot(rate.female ~ rate.male | division.ordered,
data = USCancerRates,
grid = TRUE, abline = c(0, 1),
as.table=TRUE)
# Create box-and-whisker plot
bwplot(division.ordered ~ rate.male + rate.female,
data = USCancerRates,
outer = TRUE,
# Add a label for the x-axis
xlab="Rate (per 100,000)",
# Add strip labels
strip = strip.custom(factor.levels = c("Male", "Female")))
# Create "trellis" object
tplot <-
densityplot(~ rate.male + rate.female | division.ordered,
data = USCancerRates, outer = TRUE,
plot.points = FALSE, as.table = TRUE)
# Change names for the second dimension
dimnames(tplot)[[2]] <- c("Male", "Female")
# Update x-axis label and plot
update(tplot, xlab = "Rate")
# Create "trellis" object
tplot <-
densityplot(~ rate.male + rate.female | division.ordered,
data = USCancerRates, outer = TRUE,
plot.points = FALSE, as.table = TRUE)
# Inspect dimension
dim(tplot)
## [1] 9 2
dimnames(tplot)
## $division.ordered
## [1] "Mountain" "West North Central" "Pacific"
## [4] "Middle Atlantic" "New England" "East North Central"
## [7] "West South Central" "South Atlantic" "East South Central"
##
## [[2]]
## [1] "rate.male" "rate.female"
# Select subset retaining only last three divisions
tplot[7:9, ]
Chapter 3 - Controlling scales and graphical parameters
Combining scales:
Logarithmic scales:
Graphical parameters:
Using simpleTheme():
Example code includes:
# The lattice package and the USMortality dataset have been pre-loaded.
Status <- factor(c('Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural'), levels=c("Rural", "Urban")
)
Sex <- factor(c('Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female'), levels=c("Female", "Male")
)
Cause <- factor(c('Heart disease', 'Heart disease', 'Heart disease', 'Heart disease', 'Cancer', 'Cancer', 'Cancer', 'Cancer', 'Lower respiratory', 'Lower respiratory', 'Lower respiratory', 'Lower respiratory', 'Unintentional injuries', 'Unintentional injuries', 'Unintentional injuries', 'Unintentional injuries', 'Cerebrovascular diseases', 'Cerebrovascular diseases', 'Cerebrovascular diseases', 'Cerebrovascular diseases', 'Alzheimers', 'Alzheimers', 'Alzheimers', 'Alzheimers', 'Diabetes', 'Diabetes', 'Diabetes', 'Diabetes', 'Flu and pneumonia', 'Flu and pneumonia', 'Flu and pneumonia', 'Flu and pneumonia', 'Suicide', 'Suicide', 'Suicide', 'Suicide', 'Nephritis', 'Nephritis', 'Nephritis', 'Nephritis'),
levels=c('Alzheimers', 'Cancer', 'Cerebrovascular diseases', 'Diabetes', 'Flu and pneumonia', 'Heart disease', 'Lower respiratory', 'Nephritis', 'Suicide', 'Unintentional injuries')
)
Rate <- c(210.2, 242.7, 132.5, 154.9, 195.9, 219.3, 140.2, 150.8, 44.5, 62.8, 36.5, 46.9, 49.6, 71.3, 24.7, 37.2, 36.1, 42.2, 34.9, 42.2, 19.4, 21.8, 25.5, 30.6, 24.9, 29.5, 17.1, 21.8, 17.7, 20.8, 12.9, 16.3, 19.2, 26.3, 5.3, 6.2, 15.7, 18.3, 10.7, 13.9)
SE <- c(0.2, 0.6, 0.2, 0.4, 0.2, 0.5, 0.2, 0.4, 0.1, 0.3, 0.1, 0.2, 0.1, 0.3, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.1, 0.1, 0.2, 0, 0.1, 0.1, 0.2, 0, 0.1)
USMortality <- data.frame(Status=Status, Sex=Sex, Cause=Cause, Rate=Rate, SE=SE)
# Specify upper bound to exclude Heart disease and Cancer
x_limits <- c(0, 100)
# Draw the plot
dotplot(Cause ~ Rate | Sex + Status, data = USMortality, as.table = TRUE,
xlim = x_limits)
dotplot(Cause ~ Rate | Sex + Status, data = USMortality,
as.table = TRUE,
scales = list(x = list(relation = "free",
# Specify limits for each panel
limits = list(c(0, 50), c(0, 80),
c(0, 50), c(0, 80) ))))
dotplot(Cause ~ Rate | Sex + Status, data = USMortality,
as.table = TRUE,
# Change the number of tick marks
scales = list(x = list(tick.number = 10,
# Show `Rate` labels on both bottom and top
alternating = 3,
# Rotate `Rate` labels by 90 degrees
rot = 90),
# Rotate `Cause` labels by 45 degrees
y = list(rot = 45)))
# Define tick locations as the powers of two 2^3, 2^4, ..., 2^8.
# Use the documented exponentiation operator `^`; `**` is only an
# unofficial, undocumented alias for it.
x_ticks_at <- 2^(3:8)
dotplot(Cause ~ Rate | Sex, data = USMortality,
groups = Status, auto.key = list(columns = 2),
scales = list(x = list(log = 2,
# A numeric vector with
# values 2^3, 2^4, ..., 2^8
at = x_ticks_at,
# A character vector,
# "8" for 2^3, "16" for 2^4, etc.
labels = x_ticks_at)))
# Create the dot plot
dotplot(Cause ~ Rate | Status, data = USMortality,
groups = Sex, auto.key = list(columns = 2),
scales = list(x = list(log = TRUE,
equispaced.log = FALSE)),
# Provide pch values for the two groups
pch = c(3, 1))
dotplot(Cause ~ Rate | Status, data = USMortality,
groups = Sex, auto.key = list(columns = 2),
par.settings = simpleTheme(pch = c(3, 1)),
scales = list(x = list(log = 2, equispaced.log = FALSE)))
# The WorldPhones matrix is already provided, with the first row removed so you only need to consider consecutive years
# Load the built-in WorldPhones matrix (datasets package)
data(WorldPhones)
# Drop the 1951 row so the remaining years (1956-1961) are consecutive.
# Row names are character, so compare against "1951" explicitly rather
# than relying on implicit numeric-to-character coercion; drop = FALSE
# guards against a matrix collapsing to a vector if only one row remained.
WorldPhones <- WorldPhones[row.names(WorldPhones) != "1951", , drop = FALSE]
WorldPhones
## N.Amer Europe Asia S.Amer Oceania Africa Mid.Amer
## 1956 60423 29990 4708 2568 2366 1411 733
## 1957 64721 32510 5230 2695 2526 1546 773
## 1958 68484 35218 6662 2845 2691 1663 836
## 1959 71799 37598 6856 3000 2868 1769 911
## 1960 76036 40341 8220 3145 3054 1905 1008
## 1961 79831 43173 9053 3338 3224 2005 1076
names(dimnames(WorldPhones)) <- c("Year", "Region")
# Transform matrix data to data frame
WorldPhonesDF <- as.data.frame(
# Intermediate step: convert to table
as.table(WorldPhones),
responseName = "Phones")
# Create the dot plot
dotplot(Year ~ Phones | Region,
data = WorldPhonesDF,
as.table = TRUE,
# Log-transform the x-axis
scales = list(x = list(log = TRUE,
equispaced.log = FALSE,
# Set x-axis relation to "sliced"
relation = "sliced")))
# Load latticeExtra package for ggplot2like()
library(latticeExtra)
## Loading required package: RColorBrewer
##
## Attaching package: 'latticeExtra'
## The following object is masked _by_ '.GlobalEnv':
##
## USCancerRates
## The following object is masked from 'package:ggplot2':
##
## layer
# Transform matrix data to data frame
names(dimnames(WorldPhones)) <- c("Year", "Region")
WorldPhonesDF <-
as.data.frame(as.table(WorldPhones[-1, ]),
responseName = "Phones")
# Create the dot plot
dotplot(Year ~ Phones | Region,
data = WorldPhonesDF,
as.table = TRUE,
scales = list(x = list(log = TRUE,
equispaced.log = FALSE,
relation = "sliced")),
# Fill in suitable value of par.settings
par.settings = ggplot2like(),
# Fill in suitable value of lattice.options
lattice.options = ggplot2like.opts())
# Create factor variable
airquality$Month.Name <-
factor(airquality$Month, levels = 1:12,
labels = month.name[1:12])
# Create histogram of Ozone, conditioning on Month
histogram(~ Ozone | Month.Name,
data = airquality, as.table = TRUE,
# Set border to be transparent
border = "transparent",
# Set fill color to be mid-gray
col = "grey50")
# Create factor variable
airquality$Month.Name <-
factor(airquality$Month, levels = 1:12,
labels = month.name)
levels(airquality$Month.Name)
## [1] "January" "February" "March" "April" "May"
## [6] "June" "July" "August" "September" "October"
## [11] "November" "December"
# Drop empty levels
airquality$Month.Name <- droplevels(airquality$Month.Name)
levels(airquality$Month.Name)
## [1] "May" "June" "July" "August" "September"
# Obtain colors from RColorBrewer
library(RColorBrewer)
my.colors <- brewer.pal(n = 5, name = "Set1")
# Density plot of ozone concentration grouped by month
densityplot(~ Ozone, data = airquality, groups = Month.Name,
plot.points = FALSE,
auto.key = list(space = "right"),
# Fill in value of col
par.settings = simpleTheme(col = my.colors,
# Fill in value of lwd
lwd = 2))
Chapter 4 - Customizing plots using panel functions
Panel functions:
Prepanel Functions to control limits:
Optional arguments of default panel functions:
Example code includes:
# Panel function: a standard scatter plot plus axis rugs marking
# observations that are only partially observed. Points with a missing
# y-value get a tick on the x-axis; points with a missing x-value get
# a tick on the y-axis.
panel.xyrug <- function(x, y, ...) {
  # Draw the usual scatter plot of the complete observations
  panel.xyplot(x, y, ...)
  # Rug ticks for the half-missing observations on each axis
  panel.rug(x = x[is.na(y)], y = y[is.na(x)])
}
airquality$Month.Name <-
factor(month.name[airquality$Month], levels = month.name)
xyplot(Ozone ~ Solar.R | Month.Name, data = airquality,
panel = panel.xyrug, as.table = TRUE)
# Create factor variable with month names
airquality$Month.Name <-
factor(month.name[airquality$Month], levels = month.name)
# Create box-and-whisker plot
bwplot(Month.Name ~ Ozone + Temp, airquality,
# Specify outer
outer=TRUE,
# Specify x-axis relation
scales = list(x = list(relation="free")),
# Specify layout
layout=c(2, 1),
# Specify x-axis label
xlab="Measured value")
# Create violin plot
bwplot(Month.Name ~ Ozone + Temp, airquality,
# Specify outer
outer = TRUE,
# Specify x-axis relation
scales = list(x = list(relation="free")),
# Specify layout
layout=c(2, 1),
# Specify x-axis label
xlab="Measured value",
# Replace default panel function
panel = panel.violin)
# Create panel function
# Panel function layering three elements, drawn in this order:
# a smoothed 2-D density of the points, a loess trend curve, and
# the y = x reference line on top.
panel.ss <- function(x, y, ...) {
  # Smoothed color-density representation of the scatter plot;
  # extra lattice arguments are passed through via ...
  panel.smoothScatter(x, y, ...)
  # Loess regression fit, drawn in red over the density
  panel.loess(x, y, col = "red")
  # Reference line with intercept 0 and slope 1 (i.e. y = x)
  panel.abline(0, 1)
}
# Create plot
xyplot(rate.female ~ rate.male, data = USCancerRates,
panel = panel.ss,
main = "County-wise deaths due to cancer")
## (loaded the KernSmooth namespace)
# Define prepanel function
# Prepanel function: compute panel limits wide enough for both a
# histogram and an overlaid kernel density curve of the same data.
prepanel.histdens.2 <- function(x, ...) {
  # Limits the default histogram prepanel would choose
  hist_limits <- prepanel.default.histogram(x, ...)
  # Kernel density estimate of the data (NAs removed first)
  dens <- density(x, na.rm = TRUE)
  list(
    # Trim extreme tails: x-limits at the 0.5% and 99.5% quantiles
    xlim = quantile(x, c(0.005, 0.995), na.rm = TRUE),
    # y-limit must accommodate whichever is taller: the density
    # curve's peak or the histogram's tallest bar
    ylim = c(0, max(dens$y, hist_limits$ylim[2]))
  )
}
# Panel function: draw the default lattice histogram, then overlay a
# kernel density curve estimated from the same data.
panel.histdens <- function(x, ...) {
  # Standard histogram panel; passes through type/col/border etc.
  panel.histogram(x, ...)
  # Overlay the density estimate; NAs removed before estimation
  panel.lines(density(x, na.rm = TRUE))
}
# Create a histogram of rate.male and rate.female
histogram(~ rate.male + rate.female,
data = USCancerRates, outer = TRUE,
type = "density", nint = 50,
border = "transparent", col = "lightblue",
# The panel function: panel.histdens
panel = panel.histdens,
# The prepanel function: prepanel.histdens.2
prepanel = prepanel.histdens.2,
# Ensure that the x-axis is log-transformed
# and has relation "sliced"
scales = list(x = list(log = TRUE,
equispaced.log = FALSE,
relation = "sliced")),
xlab = "Rate (per 100,000)")
# Create the box and whisker plot
bwplot(division.ordered ~ rate.male,
data = USCancerRates,
# Indicate median by line instead of dot
pch = "|",
# Include notches for confidence interval
notch = TRUE,
# The x-axis should plot log-transformed values
scales = list(x = list(log=TRUE, equispaced.log=FALSE)),
xlab = "Death Rate in Males (per 100,000)")
# Load the 'latticeExtra' package
library(latticeExtra)
# Create summary dataset
USCancerRates.state <-
with(USCancerRates, {
rmale <- tapply(rate.male, state, median, na.rm = TRUE)
rfemale <- tapply(rate.female, state, median, na.rm = TRUE)
data.frame(Rate = c(rmale, rfemale),
State = rep(names(rmale), 2),
Gender = rep(c("Male", "Female"),
each = length(rmale)))
})
# Reorder levels
library(dplyr)
USCancerRates.state <-
mutate(USCancerRates.state, State = reorder(State, Rate))
head(USCancerRates.state)
## Rate State Gender
## 1 286.00 Alabama Male
## 2 237.95 Alaska Male
## 3 209.30 Arizona Male
## 4 284.10 Arkansas Male
## 5 221.30 California Male
## 6 204.40 Colorado Male
# URLs for emojis
emoji.man <- "https://twemoji.maxcdn.com/72x72/1f468.png"
emoji.woman <- "https://twemoji.maxcdn.com/72x72/1f469.png"
# Create dotplot
# dotplot(State ~ Rate, data = USCancerRates.state,
# Specify grouping variable
# groups = Gender,
# Specify panel function
# panel = panel.xyimage,
# Specify emoji URLs
# pch = c(emoji.woman, emoji.man),
# Make symbols smaller
# cex = 0.75)
Chapter 5 - Extensions and the lattice ecosystem
New methods - lattice is used by many packages because it is highly extensible:
New high-level functions can be created:
Manipulation (extension) of trellis objects:
Example code includes:
# Use 'EuStockMarkets' time series data
data(EuStockMarkets)
str(EuStockMarkets)
## Time-Series [1:1860, 1:4] from 1991 to 1999: 1629 1614 1607 1621 1618 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "DAX" "SMI" "CAC" "FTSE"
# Create time series plot
xyplot(EuStockMarkets,
# Plot all series together
superpose = TRUE,
# Split up the time axis into parts
cut = list(number = 3, overlap = 0.25))
# Create time series plot
xyplot(EuStockMarkets,
# Specify panel function
panel=panel.horizonplot,
# Specify prepanel function
prepanel=prepanel.horizonplot)
# Load required packages
library(maps)
# Create map object for US counties
county.map <- map("county", plot = FALSE, fill = TRUE,
# Specify projection
projection = "sinusoidal")
# Create choropleth map
row.names(USCancerRates) <- rn_USCR
mapplot(row.names(USCancerRates) ~ log10(rate.male) + log10(rate.female),
data = USCancerRates,
xlab = "", scales = list(draw = FALSE),
# Specify map
map = county.map)
## Warning in (function (x, y, map, breaks, colramp, exact = FALSE,
## lwd = 0.5, : 64 unmatched regions: alaska,nome, alaska,wade hampton,
## alaska,haines, alaska,....
## Warning in (function (x, y, map, breaks, colramp, exact = FALSE,
## lwd = 0.5, : 64 unmatched regions: alaska,nome, alaska,wade hampton,
## alaska,haines, alaska,....
# Create subset for Louisiana
LACancerRates1 <- filter(USCancerRates, state == "Louisiana")
str(LACancerRates1)
## 'data.frame': 64 obs. of 11 variables:
## $ rate.male : num 369 361 349 338 338 ...
## $ LCL95.male : num 316 289 302 308 303 ...
## $ UCL95.male : num 428 446 402 372 376 ...
## $ rate.female : num 162 193 215 194 192 ...
## $ LCL95.female : num 134 150 184 176 170 ...
## $ UCL95.female : num 196 246 250 215 218 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ county :Class 'AsIs' chr [1:64] "Richland Parish" "Madison Parish" "De Soto Parish" "St. Bernard Parish" ...
## $ state.ordered : Factor w/ 49 levels "Utah","New Mexico",..: 46 46 46 46 46 46 46 46 46 46 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ division : Factor w/ 9 levels "New England",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ division.ordered: Factor w/ 9 levels "Mountain","West North Central",..: 7 7 7 7 7 7 7 7 7 7 ...
## ..- attr(*, "scores")= num [1:9(1d)] 417 416 446 466 433 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "New England" "Middle Atlantic" "South Atlantic" "East South Central" ...
# Reorder levels of county
LACancerRates2 <-
mutate(LACancerRates1,
county = reorder(county, rate.male))
# Draw confidence intervals
segplot(county ~ LCL95.male + UCL95.male,
data = LACancerRates2,
# Add point estimates
centers = rate.male,
# Draw segments rather than bands
draw.bands = FALSE)
# The 'USCancerRates' dataset
str(USCancerRates)
## 'data.frame': 3041 obs. of 11 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female : num 124 103 161 157 151 ...
## $ UCL95.female : num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
## $ state.ordered : Factor w/ 49 levels "Utah","New Mexico",..: 25 25 25 25 25 25 25 25 25 25 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ division : Factor w/ 9 levels "New England",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ division.ordered: Factor w/ 9 levels "Mountain","West North Central",..: 9 9 9 9 9 9 9 9 9 9 ...
## ..- attr(*, "scores")= num [1:9(1d)] 417 416 446 466 433 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "New England" "Middle Atlantic" "South Atlantic" "East South Central" ...
# Load the 'hexbin' package
library(hexbin)
# Create hexbin plot
hexbinplot(rate.female ~ rate.male,
data = USCancerRates,
# Add a regression line
type = "r",
# function to transform counts
trans = sqrt,
# function to invert transformed counts
inv = function(x) x^2
)
# Load the 'directlabels' package
library(directlabels)
# Use the 'airquality' dataset
str(airquality)
## 'data.frame': 153 obs. of 7 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R : int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Month.Name: Factor w/ 12 levels "January","February",..: 5 5 5 5 5 5 5 5 5 5 ...
# Create factor variable
airquality$Month.Name <-
factor(month.name[airquality$Month], levels = month.name)
# Create density plot object
tplot2 <-
densityplot(~ Ozone + Temp, data = airquality,
# Variables should go in different panels
outer = TRUE,
# Specify grouping variable
groups = Month.Name,
# Suppress display of data points
plot.points = FALSE,
# Add reference line
ref = TRUE,
# Specify layout
layout = c(2, 1),
# Omit strip labels
strip = FALSE,
# Provide column-specific x-axis labels
xlab = c("Ozone (ppb)", "Temperature (F)"),
# Let panels have independent scales
scales = list(relation="free"))
# Produce plot with direct labels
direct.label(tplot2)
# 'USCancerRates' is pre-loaded
str(USCancerRates)
## 'data.frame': 3041 obs. of 11 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female : num 124 103 161 157 151 ...
## $ UCL95.female : num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
## $ state.ordered : Factor w/ 49 levels "Utah","New Mexico",..: 25 25 25 25 25 25 25 25 25 25 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ division : Factor w/ 9 levels "New England",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ division.ordered: Factor w/ 9 levels "Mountain","West North Central",..: 9 9 9 9 9 9 9 9 9 9 ...
## ..- attr(*, "scores")= num [1:9(1d)] 417 416 446 466 433 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "New England" "Middle Atlantic" "South Atlantic" "East South Central" ...
# Create scatter plot
p <- xyplot(rate.female ~ rate.male, data = USCancerRates,
# Change plotting character
pch = 16,
# Make points semi-transparent
alpha = 0.25)
# Create layer with reference grid
l0 <- layer_(panel.grid())
# Create layer with reference line
l1 <- layer(panel.abline(0, 1))
# Create layer with regression fit
l2 <- layer(panel.smoother(x, y, method="lm"))
# Combine and plot
p + l0 + l1 + l2
Chapter 1 - R Time Series Visualization Tools
Refresher on xts and the plot() function:
Other useful visualizing functions:
Example code includes:
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
# data is a 504x4 xts object of Yahoo, Microsoft, Citigroup, and Dow
tmpData <- readr::read_delim("./RInputFiles/dataset_1_1.csv", delim=" ")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## yahoo = col_double(),
## microsoft = col_double(),
## citigroup = col_double(),
## dow_chemical = col_double()
## )
data <- xts::xts(tmpData[, -1], order.by=as.POSIXct(tmpData$Index))
# Display the first few lines of the data
head(data)
## yahoo microsoft citigroup dow_chemical
## 2015-01-01 18:00:00 50.17 44.30501 53.45259 42.48209
## 2015-01-04 18:00:00 49.13 43.89759 51.76803 41.16821
## 2015-01-05 18:00:00 49.21 43.25329 49.94556 40.50662
## 2015-01-06 18:00:00 48.59 43.80284 50.40857 40.44139
## 2015-01-07 18:00:00 50.23 45.09144 51.16711 41.44776
## 2015-01-08 18:00:00 49.72 44.71244 50.02437 41.38253
# Display the column names of the data
colnames(data)
## [1] "yahoo" "microsoft" "citigroup" "dow_chemical"
# Plot yahoo data and add title
plot(data[, "yahoo"], main="yahoo")
# Replot yahoo data with labels for X and Y axes
plot(data[, "yahoo"], main="yahoo", xlab="date", ylab="price")
# Note that type="h" is for bars
# Plot the second time series and change title
plot(data[, 2], main="microsoft")
# Replot with same title, add subtitle, use bars
plot(data[, 2], main="microsoft", sub="Daily closing price since 2015", type="h")
# Change line color to red
lines(data[, 2], col="red")
# Plot two charts on same graphical window
par(mfrow = c(2, 1))
plot(data[, 1], main="yahoo")
plot(data[, 2], main="microsoft")
# Replot with reduced margin and character sizes
par(mfrow = c(2, 1), mex=0.6, cex=0.8)
plot(data[, 1], main="yahoo")
plot(data[, 2], main="microsoft")
par(mfrow = c(1, 1), mex=1, cex=1)
# Plot the "microsoft" series
plot(data[, "microsoft"], main="Stock prices since 2015")
# Add the "dow_chemical" series in red
lines(data[, "dow_chemical"], col="red")
# Add a Y axis on the right side of the chart
axis(side=4, at=pretty(data[, "dow_chemical"]))
# Add a legend in the bottom right corner
legend("bottomright", legend=c("microsoft", "dow_chemical"), col=c("black", "red"), lty=c(1, 1))
# Plot the "citigroup" time series
plot(data[, "citigroup"], main="Citigroup")
# Create vert_line to identify January 4th, 2016 in citigroup
vert_line <- which(index(data[, "citigroup"]) == as.POSIXct("2016-01-04"))
# Add a red vertical line using vert_line
abline(v = .index(data[, "citigroup"])[vert_line], col = "red")
# Create hori_line to identify average price of citigroup
hori_line <- mean(data[, "citigroup"])
# Add a blue horizontal line using hori_line
abline(h = hori_line, col = "blue")
# Create period to hold the 3 months of 2015
period <- c("2015-01/2015-03")
# Highlight the first three months of 2015
PerformanceAnalytics::chart.TimeSeries(data[, "citigroup"], period.areas=period)
# Highlight the first three months of 2015 in light grey
PerformanceAnalytics::chart.TimeSeries(data[, "citigroup"], period.areas=period, period.color="lightgrey")
# Plot the microsoft series
plot(data[, "microsoft"], main="Dividend date and amount")
# Add the citigroup series
lines(data[, "citigroup"], col="orange", lwd=2)
# Add a new y axis for the citigroup series
axis(side=4, at=pretty(data[, "citigroup"]), col="orange")
micro_div_date <- "15 Nov. 2016"
citi_div_date <- "13 Nov. 2016"
micro_div_value <- "$0.39"
citi_div_value <- "$0.16"
# Same plot as the previous exercise
plot(data$microsoft, main = "Dividend date and amount")
lines(data$citigroup, col = "orange", lwd = 2)
axis(side = 4, at = pretty(data$citigroup), col = "orange")
# Create the two legend strings
micro <- paste0("Microsoft div. of ", micro_div_value," on ", micro_div_date)
citi <- paste0("Citigroup div. of ", citi_div_value," on ", citi_div_date)
# Create the legend in the bottom right corner
legend(x = "bottomright", legend = c(micro, citi), col = c("black", "orange"), lty = c(1, 1))
data_1_1_old <- data
Chapter 2 - Univariate Time Series
Univariate time series analysis - deals with only a single variable:
Other visualization tools:
Combining everything so far:
Example code includes:
tmpData <- readr::read_delim("./RInputFiles/dataset_2_1.csv", delim=" ")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## Apple = col_double()
## )
names(tmpData) <- c("Index", "apple")
data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot Apple's stock price
plot(data[, "apple"], main="Apple stock price")
# Create a time series called rtn
rtn <- TTR::ROC(data[, "apple"])
# Plot Apple daily price and daily returns
par(mfrow=c(1, 2))
plot(data[, "apple"], main="Apple stock price")
plot(rtn)
par(mfrow=c(1, 1))
dim(rtn)
## [1] 522 1
rtn <- rtn[complete.cases(rtn), ]
dim(rtn)
## [1] 521 1
# Create a histogram of Apple stock returns
hist(rtn, main="Apple stock return distribution", probability=TRUE)
# Add a density line
lines(density(rtn[complete.cases(rtn), ]))
# Redraw a thicker, red density line
lines(density(rtn[complete.cases(rtn), ]), col="red", lwd=2)
rtnRaw <- as.double(rtn$apple)
# Draw box and whisker plot for the Apple returns
boxplot(rtnRaw)
# Draw a box and whisker plot of a normal distribution
boxplot(rnorm(1000))
# Redraw both plots on the same graphical window
par(mfrow=c(2, 1))
boxplot(rtnRaw, horizontal=TRUE)
boxplot(rnorm(1000), horizontal=TRUE)
par(mfrow=c(1, 1))
# Draw autocorrelation plot
acf(rtn, main="Apple return autocorrelation")
# Redraw with a maximum lag of 10
acf(rtn, main="Apple return autocorrelation", lag.max=10)
# Create q-q plot
qqnorm(rtn, main="Apple return QQ-plot")
# Add a red line showing normality
qqline(rtn, col="red")
par(mfrow=c(2, 2))
hist(rtn, probability=TRUE)
lines(density(rtn), col="red")
boxplot(rtnRaw)
acf(rtn)
qqnorm(rtn)
qqline(rtn, col="red")
par(mfrow=c(1, 1))
Chapter 3 - Multivariate Time Series
Dealing with higher dimensions - visualization challenges with larger numbers of series:
Multivariate time series:
Higher dimension time series:
Example code includes:
# You are provided with a dataset (portfolio) containing the weights of stocks A (stocka) and B (stockb) in your portfolio for each month in 2016
stockA <- c(0.1, 0.4, 0.5, 0.5, 0.2, 0.3, 0.7, 0.8, 0.7, 0.2, 0.1, 0.2)
stockB <- c(0.9, 0.6, 0.5, 0.5, 0.8, 0.7, 0.3, 0.2, 0.3, 0.8, 0.9, 0.8)
pDates <- as.Date(c('2016-01-01', '2016-02-01', '2016-03-01', '2016-04-01', '2016-05-01', '2016-06-01', '2016-07-01', '2016-08-01', '2016-09-01', '2016-10-01', '2016-11-01', '2016-12-01'))
portfolio <- xts(data.frame(stocka=stockA, stockb=stockB), order.by=pDates)
# Plot stacked barplot
barplot(portfolio)
# Plot grouped barplot
barplot(portfolio, beside=TRUE)
tmpData <- readr::read_delim("./RInputFiles/data_3_2.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## sp500 = col_double(),
## citigroup = col_double(),
## microsoft = col_double(),
## apple = col_double(),
## dowchemical = col_double(),
## yahoo = col_double()
## )
# names(tmpData) <- c("Index", "apple")
my_data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
citi <- as.numeric(my_data$citigroup)
sp500 <- as.numeric(my_data$sp500)
# Draw the scatterplot
plot(y=citi, x=sp500)
# Draw a regression line
abline(reg=lm(citi ~ sp500), col="red", lwd=2)
# my_data contains the S&P 500 index plus returns for 5 stocks: Citigroup, Microsoft, Apple, Dow Chemical and Yahoo
# Create correlation matrix using Pearson method
cor(my_data)
## sp500 citigroup microsoft apple dowchemical yahoo
## sp500 1.0000000 0.5097953 0.3743215 0.3576966 0.5217243 0.2900962
## citigroup 0.5097953 1.0000000 0.4841408 0.4291841 0.5085190 0.4029490
## microsoft 0.3743215 0.4841408 1.0000000 0.5133469 0.3954523 0.4329388
## apple 0.3576966 0.4291841 0.5133469 1.0000000 0.3627755 0.3413626
## dowchemical 0.5217243 0.5085190 0.3954523 0.3627755 1.0000000 0.2938749
## yahoo 0.2900962 0.4029490 0.4329388 0.3413626 0.2938749 1.0000000
# Create correlation matrix using Spearman method
cor(my_data, method="spearman")
## sp500 citigroup microsoft apple dowchemical yahoo
## sp500 1.0000000 0.5192579 0.4244237 0.3518853 0.5316235 0.3262037
## citigroup 0.5192579 1.0000000 0.4976477 0.4374850 0.5607511 0.3780730
## microsoft 0.4244237 0.4976477 1.0000000 0.5128477 0.4684114 0.4448179
## apple 0.3518853 0.4374850 0.5128477 1.0000000 0.3681791 0.3680715
## dowchemical 0.5316235 0.5607511 0.4684114 0.3681791 1.0000000 0.3464743
## yahoo 0.3262037 0.3780730 0.4448179 0.3680715 0.3464743 1.0000000
# Create scatterplot matrix
pairs(as.data.frame(my_data))
# Create upper panel scatterplot matrix
pairs(as.data.frame(my_data), lower.panel=NULL)
# Pearson correlation matrix used by all the corrplot variants below
cor_mat <- cor(my_data)
# In this exercise, you will use the provided correlation matrix cor_mat
# Create correlation matrix plot (default method: circles sized/colored by correlation)
corrplot::corrplot(cor_mat)
# Create correlation matrix with numbers
corrplot::corrplot(cor_mat, method="number")
# Create correlation matrix with colors
corrplot::corrplot(cor_mat, method="color")
# Create upper triangle correlation matrix
corrplot::corrplot(cor_mat, method="number", type="upper")
# Draw heatmap of cor_mat (same call as the "color" plot above; kept as recorded from the course)
corrplot::corrplot(cor_mat, method="color")
# Draw upper heatmap
corrplot::corrplot(cor_mat, method="color", type="upper")
# Draw the upper heatmap with hclust (variables reordered by hierarchical clustering)
corrplot::corrplot(cor_mat, method="color", type="upper", order="hclust")
Chapter 4 - Case Study: Stock Picking for Portfolios
Case study presentation:
New stocks:
Course conclusion:
Example code includes:
# In this exercise, you are provided with a dataset data containing the value and the return of the portfolio over time, in value and return, respectively.
# Read the portfolio series; parsed column specification echoed below
tmpData <- readr::read_delim("./RInputFiles/data_4_1.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## value = col_double(),
## return = col_double()
## )
# names(tmpData) <- c("Index", "apple")
# Convert to xts, indexed by the Index date column
data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot the portfolio value
plot(data$value, main="Portfolio Value")
# Plot the portfolio return
plot(data$return, main="Portfolio Return")
# Plot a histogram of portfolio return (probability=TRUE so the density overlays correctly)
hist(data$return, probability=TRUE)
# Add a density line
lines(density(data$return), col="red", lwd=2)
# Keep a copy before `data` is reassigned to the four-stock dataset below
tmpPortfolioData <- data
# The new dataset data containing four new stocks is available in your workspace: Goldman Sachs (GS), Coca-Cola (KO), Walt Disney (DIS), Caterpillar (CAT)
# Read the four-stock price series; parsed column specification echoed below
tmpData <- readr::read_delim("./RInputFiles/data_4_3.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## GS = col_double(),
## KO = col_double(),
## DIS = col_double(),
## CAT = col_double()
## )
# names(tmpData) <- c("Index", "apple")
# Reassign `data` (the portfolio copy was saved as tmpPortfolioData above)
data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot the four stocks on the same graphical window
# mex/cex shrink margins and text so four panels fit comfortably
par(mfrow=c(2, 2), mex=0.8, cex=0.8)
plot(data[, 1])
plot(data[, 2])
plot(data[, 3])
plot(data[, 4])
# Restore single-panel layout and default margin/text scaling
par(mfrow=c(1, 1), mex=1, cex=1)
# In this exercise, you are provided with four individual series containing the return of the same four stocks:
# gs, ko, dis, cat
# NOTE(review): the DataCamp solution fit lm(stock ~ portfolio) while plotting
# the portfolio on the y-axis, so the drawn line did not match the displayed
# axes (compare the correct lm(citi ~ sp500) pattern earlier). Fixed below so
# each abline() overlays lm(y ~ x) for the plotted orientation.
# NOTE(review): portfolio comes from a different dataset than gs/ko/dis/cat;
# the series lengths are assumed to line up — TODO confirm against the inputs.
portfolio <- as.numeric(tmpPortfolioData$return)
# TTR::ROC() returns NA for the first observation; [-1] drops it
gs <- as.numeric(TTR::ROC(data[, "GS"]))[-1]
ko <- as.numeric(TTR::ROC(data[, "KO"]))[-1]
dis <- as.numeric(TTR::ROC(data[, "DIS"]))[-1]
cat <- as.numeric(TTR::ROC(data[, "CAT"]))[-1]  # NOTE(review): masks base::cat() for the rest of the session
# Draw the scatterplot of gs against the portfolio
plot(y=portfolio, x=gs)
# Add a regression line in red (portfolio explained by gs, matching the axes)
abline(reg=lm(portfolio ~ gs), col="red", lwd=2)
# Plot scatterplots and regression lines to a 2x2 window
par(mfrow=c(2, 2))
plot(y=portfolio, x=gs)
abline(reg=lm(portfolio ~ gs), col="red", lwd=2)
plot(y=portfolio, x=ko)
abline(reg=lm(portfolio ~ ko), col="red", lwd=2)
plot(y=portfolio, x=dis)
abline(reg=lm(portfolio ~ dis), col="red", lwd=2)
plot(y=portfolio, x=cat)
abline(reg=lm(portfolio ~ cat), col="red", lwd=2)
# Restore the default single-panel layout
par(mfrow=c(1, 1))
# In this exercise, you are given a dataset old.vs.new.portfolio with the following self-explanatory columns: old.portfolio.value, new.portfolio.value, old.portfolio.rtn, new.portfolio.rtn
# Read the comparison series; parsed column specification echoed below
tmpData <- readr::read_delim("./RInputFiles/old.vs.new.portfolio.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## old.portfolio.value = col_double(),
## new.portfolio.value = col_double(),
## old.portfolio.rtn = col_double(),
## new.portfolio.rtn = col_double()
## )
# names(tmpData) <- c("Index", "apple")
# Convert to xts, indexed by the Index date column
old.vs.new.portfolio <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot new and old portfolio values on same chart
plot(old.vs.new.portfolio$old.portfolio.value)
lines(old.vs.new.portfolio$new.portfolio.value, col = "red")
# Plot density of the new and old portfolio returns on same chart
plot(density(old.vs.new.portfolio$old.portfolio.rtn))
lines(density(old.vs.new.portfolio$new.portfolio.rtn), col ="red")
# Draw value, return, drawdowns of old portfolio
PerformanceAnalytics::charts.PerformanceSummary(old.vs.new.portfolio[, "old.portfolio.rtn"])
# Draw value, return, drawdowns of new portfolio
PerformanceAnalytics::charts.PerformanceSummary(old.vs.new.portfolio[, "new.portfolio.rtn"])
# Draw both portfolios on same chart
# Draw value, return, drawdowns of new portfolio
PerformanceAnalytics::charts.PerformanceSummary(old.vs.new.portfolio[, c("old.portfolio.rtn", "new.portfolio.rtn")])
Chapter 1 - Custom ggplot2 themes
Introduction to the data - finding stories in datasets:
Filtering and plotting the data:
Custom ggplot2 themes - providing a custom look to a chart:
Example code includes:
library(ggplot2)
# Load the two ILO data frames (ilo_hourly_compensation, ilo_working_hours) from .RData files
load("./RInputFiles/ilo_hourly_compensation.RData")
load("./RInputFiles/ilo_working_hours.RData")
# Join both data frames
# %>% and inner_join() come from dplyr, which was attached earlier in this document
ilo_data <- ilo_hourly_compensation %>%
inner_join(ilo_working_hours, by = c("country", "year"))
# Count the resulting rows
ilo_data %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 612
# Examine ilo_data
ilo_data
## # A tibble: 612 x 4
## country year hourly_compensation working_hours
## <chr> <chr> <dbl> <dbl>
## 1 Australia 1980.0 8.44 34.6
## 2 Canada 1980.0 8.87 34.8
## 3 Denmark 1980.0 10.8 31.9
## 4 Finland 1980.0 8.61 35.6
## 5 France 1980.0 8.90 35.4
## 6 Italy 1980.0 8.09 35.7
## 7 Japan 1980.0 5.46 40.8
## 8 Korea, Rep. 1980.0 0.950 55.3
## 9 Norway 1980.0 11.8 30.4
## 10 Spain 1980.0 5.86 36.8
## # ... with 602 more rows
# Turn year into a factor
# year is character like "1980.0"; as.numeric() strips the trailing ".0" so factor levels become "1980", "1981", ...
ilo_data <- ilo_data %>%
mutate(year = as.factor(as.numeric(year)))
# Turn country into a factor
ilo_data <- ilo_data %>%
mutate(country = as.factor(country))
# Examine the European countries vector
european_countries <- c('Finland', 'France', 'Italy', 'Norway', 'Spain', 'Sweden', 'Switzerland', 'United Kingdom', 'Belgium', 'Ireland', 'Luxembourg', 'Portugal', 'Netherlands', 'Germany', 'Hungary', 'Austria', 'Czech Rep.')
european_countries
## [1] "Finland" "France" "Italy" "Norway"
## [5] "Spain" "Sweden" "Switzerland" "United Kingdom"
## [9] "Belgium" "Ireland" "Luxembourg" "Portugal"
## [13] "Netherlands" "Germany" "Hungary" "Austria"
## [17] "Czech Rep."
# Only retain European countries
# Note: filtering drops rows but the country factor keeps all 30 original levels (see str() below)
ilo_data <- ilo_data %>%
filter(country %in% european_countries)
# Examine the structure of ilo_data
str(ilo_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 380 obs. of 4 variables:
## $ country : Factor w/ 30 levels "Australia","Austria",..: 8 9 15 22 25 27 28 29 8 9 ...
## $ year : Factor w/ 27 levels "1980","1981",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ hourly_compensation: num 8.61 8.9 8.09 11.8 5.86 ...
## $ working_hours : num 35.6 35.4 35.7 30.4 36.8 ...
# Group and summarize the data
# Per-year means of compensation and working hours across the retained European countries
ilo_data %>%
group_by(year) %>%
summarize(mean_hourly_compensation = mean(hourly_compensation),
mean_working_hours = mean(working_hours))
## # A tibble: 27 x 3
## year mean_hourly_compensation mean_working_hours
## <fct> <dbl> <dbl>
## 1 1980 9.27 34.0
## 2 1981 8.69 33.6
## 3 1982 8.36 33.5
## 4 1983 7.81 33.9
## 5 1984 7.54 33.7
## 6 1985 7.79 33.7
## 7 1986 9.70 34.0
## 8 1987 12.1 33.6
## 9 1988 13.2 33.7
## 10 1989 13.1 33.5
## # ... with 17 more rows
# Restrict the data to the 2006 observations used for plotting
plot_data <- filter(ilo_data, year == 2006)

# Scatter plot of weekly working hours against hourly compensation
ggplot(plot_data, aes(x = working_hours, y = hourly_compensation)) +
  geom_point()
# Create the plot
# Same scatter as above, now with descriptive labels via labs()
ggplot(plot_data) +
geom_point(aes(x = working_hours, y = hourly_compensation)) +
# Add labels
labs(
x = "Working hours per week",
y = "Hourly compensation",
title = "The more people work, the less compensation they seem to receive",
subtitle = "Working hours and hourly compensation in European countries, 2006",
caption = "Data source: ILO, 2017"
)
# Save your current plot into a variable: ilo_plot
# Storing the ggplot object lets different themes be layered on without rebuilding the plot
ilo_plot <- ggplot(plot_data) +
geom_point(aes(x = working_hours, y = hourly_compensation)) +
labs(
x = "Working hours per week",
y = "Hourly compensation",
title = "The more people work, the less compensation they seem to receive",
subtitle = "Working hours and hourly compensation in European countries, 2006",
caption = "Data source: ILO, 2017"
)
# Try out theme_minimal
ilo_plot +
theme_minimal()
# Try out any other possible theme function
ilo_plot +
theme_linedraw()
# Register "Bookman Old Style" under the alias "Bookman"
# NOTE: windowsFonts()/windowsFont() exist only on Windows; this line fails on other platforms
windowsFonts(Bookman=windowsFont("Bookman Old Style"))
ilo_plot <- ilo_plot +
theme_minimal() +
# Customize the "minimal" theme with another custom "theme" call
theme(
text = element_text(family = "Bookman"),
title = element_text(color = "gray25"),
plot.subtitle = element_text(size=12),
plot.caption = element_text(color = "gray30")
)
# Render the plot object
ilo_plot
# Rendered with an extra theme layer; ilo_plot itself is not reassigned here
ilo_plot +
# "theme" calls can be stacked upon each other, so this is already the third call of "theme"
theme(
plot.background = element_rect(fill = "gray95"),
plot.margin = unit(c(5, 10, 5, 10), units = "mm")
)
Chapter 2 - Creating Custom and Unique Visualization